From 3c0c995b473c046d8021abb5b34a830ccd94d868 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Fri, 10 Apr 2026 09:56:21 -0400
Subject: [PATCH 01/16] Support multiple output paths

---
 apps/analysis.py | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index af4b1460..a5b3edaa 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -1752,14 +1752,18 @@ def main():
     query_params = st.query_params
 
     # Sidebar: output directory selection
-    st.sidebar.header("Output Directory")
+    st.sidebar.header("Output Directories")
     default_output = query_params.get("output_dir", _DEFAULT_OUTPUT_DIR)
-    output_dir = Path(st.sidebar.text_input("Path to output directory", value=default_output))
+    # Normalize comma-separated (from URL query params) to newline-separated (for text area)
+    if "," in default_output and "\n" not in default_output:
+        default_output = "\n".join(p.strip() for p in default_output.split(","))
+    output_dirs_input = st.sidebar.text_area("Paths to output directories (one per line)", value=default_output)
+    output_dirs = [Path(p.strip()) for p in output_dirs_input.splitlines() if p.strip()] or [Path(_DEFAULT_OUTPUT_DIR)]
 
-    run_dirs = get_run_directories(output_dir)
+    run_dirs = [rd for od in output_dirs for rd in get_run_directories(od)]
 
     if not run_dirs:
-        st.error(f"No run directories found in {output_dir}")
+        st.error(f"No run directories found in: {', '.join(str(d) for d in output_dirs)}")
         return
 
     # View mode
@@ -1770,17 +1774,19 @@ def main():
     view_mode = st.sidebar.radio("View", view_options, index=default_view_idx, label_visibility="collapsed")
 
     if view_mode == "Cross-Run Comparison":
-        render_cross_run_comparison([output_dir / d.name for d in run_dirs], str(output_dir))
+        render_cross_run_comparison(run_dirs, ", ".join(str(d) for d in output_dirs))
         return
 
     # Sidebar: run selection
     st.sidebar.header("Run Selection")
-    run_dir_names = [d.name for d in run_dirs]
+    multiple_output_dirs = len(output_dirs) > 1
+    run_dir_labels = [str(d) if multiple_output_dirs else d.name for d in run_dirs]
     default_run_idx = 0
-    if "run" in query_params and query_params["run"] in run_dir_names:
-        default_run_idx = run_dir_names.index(query_params["run"])
-    selected_run_name = st.sidebar.selectbox("Select Run", run_dir_names, index=default_run_idx)
-    selected_run_dir = output_dir / selected_run_name
+    if "run" in query_params and query_params["run"] in run_dir_labels:
+        default_run_idx = run_dir_labels.index(query_params["run"])
+    selected_run_label = st.sidebar.selectbox("Select Run", run_dir_labels, index=default_run_idx)
+    selected_run_dir = run_dirs[run_dir_labels.index(selected_run_label)]
+    selected_run_name = selected_run_dir.name
 
     run_config = _load_run_config(selected_run_dir)
     if run_config:
@@ -1791,9 +1797,9 @@ def main():
     if view_mode == "Run Overview":
         st.query_params.from_dict(
             {
-                "output_dir": str(output_dir),
+                "output_dir": ",".join(str(d) for d in output_dirs),
                 "view": "Run Overview",
-                "run": selected_run_name,
+                "run": selected_run_label,
             }
         )
         render_run_overview(selected_run_dir)
@@ -1840,9 +1846,9 @@ def main():
 
     # Update query params for deep linking
     new_params = {
-        "output_dir": str(output_dir),
+        "output_dir": ",".join(str(d) for d in output_dirs),
         "view": "Record Detail",
-        "run": selected_run_name,
+        "run": selected_run_label,
         "record": selected_record_name,
     }
     if selected_trial:

From 4b9c0556ca80015ed2957ebe0ae8200621b70238 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Fri, 10 Apr 2026 09:56:27 -0400
Subject: [PATCH 02/16] Add toggle to show only the latest run per system

---
 apps/analysis.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/apps/analysis.py b/apps/analysis.py
index a5b3edaa..35632039 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -12,6 +12,7 @@
 import html
 import json
 import os
+import re
 from pathlib import Path
 
 import pandas as pd
@@ -101,6 +102,28 @@ def get_run_directories(output_dir: Path) -> list[Path]:
     return sorted(run_dirs, key=lambda d: d.name, reverse=True)
 
 
+def _system_name_from_run(run_dir: Path) -> str:
+    """Extract the system name from a run folder name (<timestamp>_<system_name>)."""
+    m = re.match(r"^\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}\.\d+_(.+)$", run_dir.name)
+    return m.group(1) if m else run_dir.name
+
+
+def filter_latest_runs(run_dirs: list[Path]) -> list[Path]:
+    """Keep only the most recent run per system name.
+
+    Assumes run_dirs is already sorted newest-first (as returned by
+    get_run_directories), so the first occurrence of each system name wins.
+    """
+    seen: set[str] = set()
+    result = []
+    for d in run_dirs:
+        system = _system_name_from_run(d)
+        if system not in seen:
+            seen.add(system)
+            result.append(d)
+    return result
+
+
 def get_record_directories(run_dir: Path) -> list[Path]:
     """Get all record directories in a run, sorted by record ID."""
     records_dir = run_dir / "records"
@@ -1762,6 +1785,10 @@ def main():
 
     run_dirs = [rd for od in output_dirs for rd in get_run_directories(od)]
 
+    latest_only = st.sidebar.toggle("Latest run per system only", value=True)
+    if latest_only:
+        run_dirs = filter_latest_runs(run_dirs)
+
     if not run_dirs:
         st.error(f"No run directories found in: {', '.join(str(d) for d in output_dirs)}")
         return

From e2f0833a99b332d4a0eeb19122f40df00f357900 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Fri, 10 Apr 2026 10:54:10 -0400
Subject: [PATCH 03/16] Show system name before timestamp in cross-run
 comparison run labels

---
 apps/analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index 35632039..ffcfdd14 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -476,7 +476,7 @@ def _get_run_label(run_name: str, run_config: dict) -> str:
     suffix = _model_suffix_from_config(run_config)
     if not suffix or suffix in run_name:
         return run_name
-    return f"{run_name} ({suffix})"
+    return f"{suffix} ({run_name})"
 
 
 def _color_cell(val):

From 7a05cf2e50aeb292590a909786de08df3ce7df2e Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Fri, 10 Apr 2026 11:03:35 -0400
Subject: [PATCH 04/16] Reformat timestamp-prefixed run names as system_name
 (timestamp) in labels

---
 apps/analysis.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/apps/analysis.py b/apps/analysis.py
index ffcfdd14..656cef12 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -471,8 +471,15 @@ def _model_suffix_from_config(run_config: dict) -> str:
     return "_".join(p for p in parts if p)
 
 
+_TIMESTAMP_RUN_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}\.\d+)_(.+)$")
+
+
 def _get_run_label(run_name: str, run_config: dict) -> str:
     """Build a display label for a run, appending model info if not already in the name."""
+    # If the run name is <timestamp>_<system_name>, reformat as <system_name> (<timestamp>)
+    m = _TIMESTAMP_RUN_RE.match(run_name)
+    if m:
+        return f"{m.group(2)} ({m.group(1)})"
     suffix = _model_suffix_from_config(run_config)
     if not suffix or suffix in run_name:
         return run_name

From 4e60c125cd3697eaf6e317c88ed1cce8d901996a Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Fri, 10 Apr 2026 11:19:46 -0400
Subject: [PATCH 05/16] Split run label into separate System and Timestamp
 columns in cross-run comparison table

---
 apps/analysis.py | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index 656cef12..96b735a3 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -474,16 +474,21 @@ def _model_suffix_from_config(run_config: dict) -> str:
 _TIMESTAMP_RUN_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}\.\d+)_(.+)$")
 
 
-def _get_run_label(run_name: str, run_config: dict) -> str:
-    """Build a display label for a run, appending model info if not already in the name."""
-    # If the run name is <timestamp>_<system_name>, reformat as <system_name> (<timestamp>)
+def _get_system_and_timestamp(run_name: str, run_config: dict) -> tuple[str, str]:
+    """Return (system_name, timestamp) for a run."""
     m = _TIMESTAMP_RUN_RE.match(run_name)
     if m:
-        return f"{m.group(2)} ({m.group(1)})"
+        return m.group(2), m.group(1)
     suffix = _model_suffix_from_config(run_config)
-    if not suffix or suffix in run_name:
-        return run_name
-    return f"{suffix} ({run_name})"
+    if suffix and suffix not in run_name:
+        return f"{suffix} ({run_name})", ""
+    return run_name, ""
+
+
+def _get_run_label(run_name: str, run_config: dict) -> str:
+    """Build a display label for a run (used for chart legends)."""
+    system, timestamp = _get_system_and_timestamp(run_name, run_config)
+    return f"{system} ({timestamp})" if timestamp else system
 
 
 def _color_cell(val):
@@ -973,9 +978,12 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""):
             metric_names = list(per_metric.keys())
             all_metric_names.update(metric_names)
             model_details = _extract_model_details(run_config)
+            system_name, run_timestamp = _get_system_and_timestamp(run_name, run_config)
             summary: dict = {
                 "run": run_name,
                 "label": _get_run_label(run_name, run_config),
+                "system_name": system_name,
+                "run_timestamp": run_timestamp,
                 "records": metrics_summary.get("total_records", 0),
                 "pipeline_type": _classify_pipeline_type(run_config),
                 **model_details,
@@ -997,9 +1005,12 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""):
             all_metric_names.update(metric_names)
             df = pd.DataFrame(rows)
             model_details = _extract_model_details(run_config)
+            system_name, run_timestamp = _get_system_and_timestamp(run_name, run_config)
             summary = {
                 "run": run_name,
                 "label": _get_run_label(run_name, run_config),
+                "system_name": system_name,
+                "run_timestamp": run_timestamp,
                 "records": len(df),
                 "pipeline_type": _classify_pipeline_type(run_config),
                 **model_details,
@@ -1060,7 +1071,7 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""):
 
     # Metrics table: EVA composites first, then all individual metrics
     table_composites = [c for c in _EVA_BAR_COMPOSITES if c in summary_df.columns]
-    display_cols = ["label", "records"] + table_composites + ordered_metrics
+    display_cols = ["system_name", "run_timestamp", "records"] + table_composites + ordered_metrics
     display_df = summary_df[display_cols].copy()
 
     # Add link column to navigate to Run Overview
@@ -1072,7 +1083,15 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""):
     )
 
     composite_rename = {c: f"[EVA] {_EVA_COMPOSITE_DISPLAY[c]}" for c in table_composites}
-    display_df = display_df.rename(columns={"label": "Run", "records": "# Records", **composite_rename, **col_rename})
+    display_df = display_df.rename(
+        columns={
+            "system_name": "System",
+            "run_timestamp": "Timestamp",
+            "records": "# Records",
+            **composite_rename,
+            **col_rename,
+        }
+    )
     renamed_composites = [composite_rename[c] for c in table_composites]
     renamed_metrics = [col_rename[m] for m in ordered_metrics]
     all_score_cols = renamed_composites + renamed_metrics

From e9b1c974c504bb70569830e29c50c0ff5e446a6c Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Fri, 10 Apr 2026 16:36:31 -0400
Subject: [PATCH 06/16] Fix link generation for multiple output dirs in
 cross-run comparison table

---
 apps/analysis.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index 96b735a3..3994b021 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -981,6 +981,7 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""):
             system_name, run_timestamp = _get_system_and_timestamp(run_name, run_config)
             summary: dict = {
                 "run": run_name,
+                "run_output_dir": str(run_dir.parent),
                 "label": _get_run_label(run_name, run_config),
                 "system_name": system_name,
                 "run_timestamp": run_timestamp,
@@ -1008,6 +1009,7 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""):
             system_name, run_timestamp = _get_system_and_timestamp(run_name, run_config)
             summary = {
                 "run": run_name,
+                "run_output_dir": str(run_dir.parent),
                 "label": _get_run_label(run_name, run_config),
                 "system_name": system_name,
                 "run_timestamp": run_timestamp,
@@ -1074,12 +1076,11 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""):
     display_cols = ["system_name", "run_timestamp", "records"] + table_composites + ordered_metrics
     display_df = summary_df[display_cols].copy()
 
-    # Add link column to navigate to Run Overview
-    output_dir_str = str(run_dirs[0].parent) if run_dirs else ""
+    # Add link column to navigate to Run Overview (use per-run output dir to support multiple output dirs)
     display_df.insert(
         0,
         "link",
-        summary_df["run"].apply(lambda r: f"?output_dir={output_dir_str}&view=Run+Overview&run={r}"),
+        summary_df.apply(lambda row: f"?output_dir={row['run_output_dir']}&view=Run+Overview&run={row['run']}", axis=1),
     )
 
     composite_rename = {c: f"[EVA] {_EVA_COMPOSITE_DISPLAY[c]}" for c in table_composites}

From b9a267979a9f08480d3229a00ab73f61ad4e5306 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Mon, 13 Apr 2026 16:58:48 -0400
Subject: [PATCH 07/16] Add toggle to hide failed attempts in run overview
 per-record table

Failed attempt rows (from *_failed_attempt_* directories) are now flagged
in _collect_run_metrics and filtered out by default. A toggle appears above
the table when failed attempts exist, allowing the user to show them.
---
 apps/analysis.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index 3994b021..efded35e 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -506,13 +506,18 @@ def _color_cell(val):
 
 
 def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
-    """Collect all metrics rows for a run. Returns (rows, metric_names)."""
+    """Collect all metrics rows for a run. Returns (rows, metric_names).
+
+    Rows for failed attempts (directories named *_failed_attempt_*) are marked
+    with ``_is_failed_attempt=True`` so the caller can filter them.
+    """
     record_dirs = get_record_directories(run_dir)
     rows: list[dict] = []
     all_metric_names: set[str] = set()
 
     for record_dir in record_dirs:
         record_id = record_dir.name
+        is_failed_attempt = "_failed_attempt_" in record_id
         data_dirs = _get_record_data_dirs(record_dir)
 
         for trial_label, data_path in data_dirs:
@@ -520,7 +525,7 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
             if not metrics:
                 continue
 
-            row: dict = {"record": record_id}
+            row: dict = {"record": record_id, "_is_failed_attempt": is_failed_attempt}
             if trial_label:
                 row["trial"] = trial_label
 
@@ -1264,12 +1269,19 @@ def render_run_overview(run_dir: Path):
     # --- Per-record table ---
     st.markdown("### Per-Record Metrics")
 
+    has_failed_attempts = df["_is_failed_attempt"].any()
+    show_failed = False
+    if has_failed_attempts:
+        show_failed = st.toggle("Show failed attempts", value=False)
+
+    table_df = df if show_failed else df[~df["_is_failed_attempt"]]
+
     run_name = run_dir.name
     leading_cols = ["record"]
     if has_trials:
         leading_cols.append("trial")
-    ordered_metrics = [m for m in metric_names if m in df.columns]
-    df = df[leading_cols + ordered_metrics]
+    ordered_metrics = [m for m in metric_names if m in table_df.columns]
+    table_df = table_df[leading_cols + ordered_metrics]
 
     # Add link column to navigate to Record Detail
     def _record_link(row):
@@ -1278,12 +1290,12 @@ def _record_link(row):
             params += f"&trial={row['trial']}"
         return params
 
-    df = df.copy()
-    df.insert(0, "link", df.apply(_record_link, axis=1))
-    df = df.rename(columns=col_rename)
+    table_df = table_df.copy()
+    table_df.insert(0, "link", table_df.apply(_record_link, axis=1))
+    table_df = table_df.rename(columns=col_rename)
 
     renamed_metrics = [col_rename[m] for m in ordered_metrics]
-    styled = df.style.map(_color_cell, subset=renamed_metrics)
+    styled = table_df.style.map(_color_cell, subset=renamed_metrics)
     styled = styled.format(dict.fromkeys(renamed_metrics, "{:.3f}"), na_rep="—")
     st.dataframe(
         styled,
@@ -1293,7 +1305,7 @@ def _record_link(row):
         },
     )
 
-    csv = df.drop(columns=["link"]).to_csv(index=False)
+    csv = table_df.drop(columns=["link"]).to_csv(index=False)
     st.download_button("Download CSV", csv, file_name=f"{run_dir.name}_metrics.csv", mime="text/csv")
 
 

From 5c4e67cc5ee88010674db0c96a3e153fb1b74f38 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Mon, 13 Apr 2026 16:59:16 -0400
Subject: [PATCH 08/16] Split cross-run comparison table into Accuracy, EVA-X,
 and Diagnostic sections

Adds an Output Dir column showing the parent directory of each run, and
splits the single metrics table into three labelled subtables: Accuracy
(EVA-A composites + [Accuracy] metrics), EVA-X (EVA-X composites +
[Experience] metrics), and Diagnostic & Other (remaining metrics).
Each subtable is skipped if it has no data for the current run set.
---
 apps/analysis.py | 69 ++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index efded35e..25260707 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -1076,41 +1076,46 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""):
         )
         st.plotly_chart(bar_fig)
 
-    # Metrics table: EVA composites first, then all individual metrics
-    table_composites = [c for c in _EVA_BAR_COMPOSITES if c in summary_df.columns]
-    display_cols = ["system_name", "run_timestamp", "records"] + table_composites + ordered_metrics
-    display_df = summary_df[display_cols].copy()
-
-    # Add link column to navigate to Run Overview (use per-run output dir to support multiple output dirs)
-    display_df.insert(
-        0,
-        "link",
-        summary_df.apply(lambda row: f"?output_dir={row['run_output_dir']}&view=Run+Overview&run={row['run']}", axis=1),
+    # Split metrics into three groups by category
+    eva_a_composites = [c for c in ["EVA-A_pass", "EVA-A_mean"] if c in summary_df.columns]
+    eva_x_composites = [c for c in ["EVA-X_pass", "EVA-X_mean"] if c in summary_df.columns]
+    accuracy_metrics = [m for m in ordered_metrics if _METRIC_GROUP.get(m) == "Accuracy"]
+    experience_metrics = [m for m in ordered_metrics if _METRIC_GROUP.get(m) == "Experience"]
+    other_metrics = [m for m in ordered_metrics if _METRIC_GROUP.get(m) not in {"Accuracy", "Experience"}]
+
+    id_cols = ["system_name", "run_timestamp", "run_output_dir", "records"]
+    id_rename = {
+        "system_name": "System",
+        "run_timestamp": "Timestamp",
+        "run_output_dir": "Output Dir",
+        "records": "# Records",
+    }
+    link_series = summary_df.apply(
+        lambda row: f"?output_dir={row['run_output_dir']}&view=Run+Overview&run={row['run']}",
+        axis=1,
     )
 
-    composite_rename = {c: f"[EVA] {_EVA_COMPOSITE_DISPLAY[c]}" for c in table_composites}
-    display_df = display_df.rename(
-        columns={
-            "system_name": "System",
-            "run_timestamp": "Timestamp",
-            "records": "# Records",
-            **composite_rename,
-            **col_rename,
-        }
-    )
-    renamed_composites = [composite_rename[c] for c in table_composites]
-    renamed_metrics = [col_rename[m] for m in ordered_metrics]
-    all_score_cols = renamed_composites + renamed_metrics
+    def _show_subtable(heading: str, composites: list, metrics: list) -> None:
+        if not composites and not metrics:
+            return
+        st.markdown(f"#### {heading}")
+        composite_rename = {c: f"[EVA] {_EVA_COMPOSITE_DISPLAY[c]}" for c in composites}
+        cols = id_cols + composites + metrics
+        sub_df = summary_df[cols].copy()
+        sub_df.insert(0, "link", link_series)
+        sub_df = sub_df.rename(columns={**id_rename, **composite_rename, **col_rename})
+        score_cols = [composite_rename[c] for c in composites] + [col_rename[m] for m in metrics]
+        styled = sub_df.style.map(_color_cell, subset=score_cols)
+        styled = styled.format(dict.fromkeys(score_cols, "{:.3f}"), na_rep="—")
+        st.dataframe(
+            styled,
+            hide_index=True,
+            column_config={"link": st.column_config.LinkColumn(" ", display_text="🔍", width=40)},
+        )
 
-    styled = display_df.style.map(_color_cell, subset=all_score_cols)
-    styled = styled.format(dict.fromkeys(all_score_cols, "{:.3f}"), na_rep="—")
-    st.dataframe(
-        styled,
-        hide_index=True,
-        column_config={
-            "link": st.column_config.LinkColumn(" ", display_text="🔍", width=40),
-        },
-    )
+    _show_subtable("Accuracy Metrics", eva_a_composites, accuracy_metrics)
+    _show_subtable("EVA-X Metrics", eva_x_composites, experience_metrics)
+    _show_subtable("Diagnostic & Other Metrics", [], other_metrics)
 
     csv = summary_df.drop(columns=["label"]).to_csv(index=False)
     st.download_button("Download CSV", csv, file_name="cross_run_comparison.csv", mime="text/csv")

From 369171bdc9f8efdd5cc260e3a8d5ec2533260ff9 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Mon, 13 Apr 2026 17:06:53 -0400
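Subject: [PATCH 09/16] Fix failed attempt detection to check trial label
 instead of record id

Also rename the cross-run comparison subtable headings to
"Accuracy Metrics (EVA-A)" and "Experience Metrics (EVA-X)".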

---
 apps/analysis.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index 25260707..e98b98df 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -517,7 +517,6 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
 
     for record_dir in record_dirs:
         record_id = record_dir.name
-        is_failed_attempt = "_failed_attempt_" in record_id
         data_dirs = _get_record_data_dirs(record_dir)
 
         for trial_label, data_path in data_dirs:
@@ -525,6 +524,7 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
             if not metrics:
                 continue
 
+            is_failed_attempt = "_failed_attempt_" in trial_label
             row: dict = {"record": record_id, "_is_failed_attempt": is_failed_attempt}
             if trial_label:
                 row["trial"] = trial_label
@@ -1113,8 +1113,8 @@ def _show_subtable(heading: str, composites: list, metrics: list) -> None:
             column_config={"link": st.column_config.LinkColumn(" ", display_text="🔍", width=40)},
         )
 
-    _show_subtable("Accuracy Metrics", eva_a_composites, accuracy_metrics)
-    _show_subtable("EVA-X Metrics", eva_x_composites, experience_metrics)
+    _show_subtable("Accuracy Metrics (EVA-A)", eva_a_composites, accuracy_metrics)
+    _show_subtable("Experience Metrics (EVA-X)", eva_x_composites, experience_metrics)
     _show_subtable("Diagnostic & Other Metrics", [], other_metrics)
 
     csv = summary_df.drop(columns=["label"]).to_csv(index=False)

From fc49c3ecd02396afd349c7c032b7ed7452146db4 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 11:08:52 -0400
Subject: [PATCH 10/16] Update analysis app run command to use uv run

---
 apps/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/README.md b/apps/README.md
index 2da58a34..83758ddb 100644
--- a/apps/README.md
+++ b/apps/README.md
@@ -9,13 +9,13 @@ Interactive dashboard for visualizing and comparing results.
 ### Usage
 
 ```bash
-streamlit run apps/analysis.py
+uv run streamlit run apps/analysis.py
 ```
 
 By default, the app looks for runs in the `output/` directory. You can change this in the sidebar or by setting the `EVA_OUTPUT_DIR` environment variable:
 
 ```bash
-EVA_OUTPUT_DIR=path/to/results streamlit run apps/analysis.py
+EVA_OUTPUT_DIR=path/to/results uv run streamlit run apps/analysis.py
 ```
 
 ### Views

From f7f822ac5095b27f590d144ffba67ed85db58b15 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 11:11:13 -0400
Subject: [PATCH 11/16] Group metrics by category in record detail metrics tab

---
 apps/analysis.py | 102 ++++++++++++++++++++++++++---------------------
 1 file changed, 56 insertions(+), 46 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index e98b98df..88b09cfa 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -1322,54 +1322,64 @@ def render_metrics_tab(metrics: RecordMetrics | None):
 
     st.markdown("### Metrics")
 
+    # Group metrics by category, preserving insertion order within each group
+    grouped: dict[str, list[tuple[str, object]]] = {}
     for metric_name, metric_score in metrics.metrics.items():
-        with st.expander(
-            f"**{metric_name}**: {metric_score.normalized_score:.3f}"
-            if metric_score.normalized_score is not None
-            else f"**{metric_name}**"
-        ):
-            col1, col2 = st.columns([1, 3])
-
-            with col1:
-                st.metric("Score", f"{metric_score.score:.3f}" if metric_score.score is not None else "N/A")
-                st.metric(
-                    "Normalized",
-                    f"{metric_score.normalized_score:.3f}" if metric_score.normalized_score is not None else "N/A",
-                )
-                if metric_score.error:
-                    st.error(f"Error: {metric_score.error}")
-
-            with col2:
-                if metric_score.details:
-                    st.markdown("**Details:**")
-                    if "explanation" in metric_score.details:
-                        st.write(metric_score.details["explanation"])
-
-                    if "judge_prompt" in metric_score.details:
-                        with st.expander("View Judge Prompt"):
-                            prompt = metric_score.details["judge_prompt"]
-                            if isinstance(prompt, str):
-                                st.text(prompt)
-                            else:
-                                st.json(prompt)
-                    elif "judge_prompts" in metric_score.details:
-                        with st.expander("View Judge Prompts"):
-                            prompts = metric_score.details["judge_prompts"]
-                            if isinstance(prompts, list):
-                                for i, prompt in enumerate(prompts):
-                                    st.markdown(f"**Turn {i + 1}:**")
+        cat = _METRIC_GROUP.get(metric_name, "Other")
+        grouped.setdefault(cat, []).append((metric_name, metric_score))
+
+    for cat in _CATEGORY_ORDER + [c for c in grouped if c not in _CATEGORY_ORDER]:
+        if cat not in grouped:
+            continue
+        st.markdown(f"#### {cat}")
+        for metric_name, metric_score in grouped[cat]:
+            with st.expander(
+                f"**{metric_name}**: {metric_score.normalized_score:.3f}"
+                if metric_score.normalized_score is not None
+                else f"**{metric_name}**"
+            ):
+                col1, col2 = st.columns([1, 3])
+
+                with col1:
+                    st.metric("Score", f"{metric_score.score:.3f}" if metric_score.score is not None else "N/A")
+                    st.metric(
+                        "Normalized",
+                        f"{metric_score.normalized_score:.3f}" if metric_score.normalized_score is not None else "N/A",
+                    )
+                    if metric_score.error:
+                        st.error(f"Error: {metric_score.error}")
+
+                with col2:
+                    if metric_score.details:
+                        st.markdown("**Details:**")
+                        if "explanation" in metric_score.details:
+                            st.write(metric_score.details["explanation"])
+
+                        if "judge_prompt" in metric_score.details:
+                            with st.expander("View Judge Prompt"):
+                                prompt = metric_score.details["judge_prompt"]
+                                if isinstance(prompt, str):
                                     st.text(prompt)
-                                    st.divider()
-                            else:
-                                st.json(prompts)
-
-                    details_to_show = {
-                        k: v
-                        for k, v in metric_score.details.items()
-                        if k not in ["explanation", "judge_prompt", "judge_prompts"]
-                    }
-                    if details_to_show:
-                        st.json(details_to_show)
+                                else:
+                                    st.json(prompt)
+                        elif "judge_prompts" in metric_score.details:
+                            with st.expander("View Judge Prompts"):
+                                prompts = metric_score.details["judge_prompts"]
+                                if isinstance(prompts, list):
+                                    for i, prompt in enumerate(prompts):
+                                        st.markdown(f"**Turn {i + 1}:**")
+                                        st.text(prompt)
+                                        st.divider()
+                                else:
+                                    st.json(prompts)
+
+                        details_to_show = {
+                            k: v
+                            for k, v in metric_score.details.items()
+                            if k not in ["explanation", "judge_prompt", "judge_prompts"]
+                        }
+                        if details_to_show:
+                            st.json(details_to_show)
 
 
 def render_processed_data_tab(metrics: RecordMetrics | None):

From e8126828b60084ebe335402188774b4b00617aa7 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 11:18:09 -0400
Subject: [PATCH 12/16] Show dimension scores inline in metrics detail tab

---
 apps/analysis.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/apps/analysis.py b/apps/analysis.py
index 88b09cfa..6df26f19 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -1349,6 +1349,20 @@ def render_metrics_tab(metrics: RecordMetrics | None):
                     if metric_score.error:
                         st.error(f"Error: {metric_score.error}")
 
+                    # Dimension scores (e.g. faithfulness, conversation_progression)
+                    explanation = metric_score.details.get("explanation") if metric_score.details else None
+                    dimensions = explanation.get("dimensions") if isinstance(explanation, dict) else None
+                    if dimensions:
+                        st.markdown("**Dimensions**")
+                        for dim_name, dim_data in dimensions.items():
+                            if isinstance(dim_data, dict):
+                                rating = dim_data.get("rating")
+                                flagged = dim_data.get("flagged", False)
+                                label = dim_name.replace("_", " ").title()
+                                score_str = f"{rating}/3" if rating is not None else "N/A"
+                                prefix = "⚠ " if flagged else ""
+                                st.markdown(f"{prefix}**{label}:** {score_str}")
+
                 with col2:
                     if metric_score.details:
                         st.markdown("**Details:**")

From 327fb88a1db16b7a5193e0927aad74a3c5ea4637 Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 11:24:33 -0400
Subject: [PATCH 13/16] Show metric groups in 4 columns in conversation trace
 tab

---
 apps/analysis.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index 6df26f19..d42a6fc5 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -1543,10 +1543,34 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat
             group = _METRIC_GROUP.get(name, "Other")
             grouped.setdefault(group, []).append(name)
 
-        for group in ["Accuracy", "Experience", "Validation", "Other"]:
-            names_in_group = grouped.get(group)
-            if not names_in_group:
-                continue
+        # Accuracy / Experience / Diagnostic / Validation side by side in 4 columns
+        col_groups = ["Accuracy", "Experience", "Diagnostic", "Validation"]
+        present_col_groups = [g for g in col_groups if grouped.get(g)]
+        if present_col_groups:
+            outer_cols = st.columns(len(present_col_groups))
+            for outer_col, group in zip(outer_cols, present_col_groups):
+                names_in_group = grouped[group]
+                with outer_col:
+                    st.caption(group)
+                    for name in names_in_group:
+                        m = all_top_metrics[name]
+                        score = m["normalized_score"]
+                        display_name = _format_metric_name(name)
+                        score_str = f"{score:.3f}" if score is not None else "N/A"
+                        icon = None if score is None else "🟢" if score >= 0.8 else "🟡" if score >= 0.4 else "🔴"
+                        st.button(
+                            f"{display_name}\n{score_str}",
+                            key=f"metric_btn_{name}",
+                            on_click=st.session_state.update,
+                            kwargs={"selected_metric": None if selected == name else name},
+                            type="primary" if selected == name else "secondary",
+                            icon=icon,
+                            width="stretch",
+                        )
+
+        # Any remaining groups (Other, Conversation Quality, etc.) rendered below
+        for group in [g for g in grouped if g not in col_groups]:
+            names_in_group = grouped[group]
             st.caption(group)
             cols = st.columns(min(len(names_in_group), 5))
             for i, name in enumerate(names_in_group):

From c7719fffbf51c04b0ddde9860ae7635c41a165ed Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 11:27:12 -0400
Subject: [PATCH 14/16] Fix invisible text in per-turn breakdown boxes in dark
 mode

---
 apps/analysis.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/analysis.py b/apps/analysis.py
index d42a6fc5..81312930 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -1638,7 +1638,7 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat
                         else:
                             color = "#f44336"
                         st.html(
-                            f'<div style="margin-bottom:8px; padding:6px 10px; background:#fafafa; border-radius:6px; border-left:3px solid {color};">'
+                            f'<div style="margin-bottom:8px; padding:6px 10px; background:rgba(128,128,128,0.1); border-radius:6px; border-left:3px solid {color};">'
                             f"<strong>Turn {tid_str}</strong> — "
                             f'<span style="color:{color}; font-weight:600;">{label}</span> '
                             f'<span style="opacity:0.6;">(latency: {latency_str})</span>'
@@ -1661,7 +1661,7 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat
                         color = _score_color(rating)
                         rating_str = f"{rating:.2f}" if isinstance(rating, (int, float)) else str(rating)
                         st.html(
-                            f'<div style="margin-bottom:8px; padding:6px 10px; background:#fafafa; border-radius:6px; border-left:3px solid {color};">'
+                            f'<div style="margin-bottom:8px; padding:6px 10px; background:rgba(128,128,128,0.1); border-radius:6px; border-left:3px solid {color};">'
                             f"<strong>Turn {tid_str}</strong> — "
                             f'<span style="color:{color}; font-weight:600;">{rating_str}</span>'
                             f"{'<br><span style=font-size:0.88em;opacity:0.75;>' + html.escape(str(explanation)) + '</span>' if explanation else ''}"

From c77a04cccb802ccabf1a2716f39761d1cc2bf52e Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Tue, 14 Apr 2026 16:24:26 -0400
Subject: [PATCH 15/16] Fix system/timestamp split for timestamp-only run
 directory names

---
 apps/analysis.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/apps/analysis.py b/apps/analysis.py
index 5341739e..20be9f0e 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -472,6 +472,7 @@ def _model_suffix_from_config(run_config: dict) -> str:
 
 
 _TIMESTAMP_RUN_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}\.\d+)_(.+)$")
+_TIMESTAMP_ONLY_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}\.\d+)$")
 
 
 def _get_system_and_timestamp(run_name: str, run_config: dict) -> tuple[str, str]:
@@ -479,6 +480,11 @@ def _get_system_and_timestamp(run_name: str, run_config: dict) -> tuple[str, str
     m = _TIMESTAMP_RUN_RE.match(run_name)
     if m:
         return m.group(2), m.group(1)
+    # Timestamp-only directory (no system name suffix) — still extract the timestamp
+    m = _TIMESTAMP_ONLY_RE.match(run_name)
+    if m:
+        suffix = _model_suffix_from_config(run_config)
+        return suffix or "", m.group(1)
     suffix = _model_suffix_from_config(run_config)
     if suffix and suffix not in run_name:
         return f"{suffix} ({run_name})", ""

From fa30ec57976439b78a2e044697912d6745dba41d Mon Sep 17 00:00:00 2001
From: "fanny.riols" <fanny.riols@servicenow.com>
Date: Wed, 15 Apr 2026 09:05:08 -0400
Subject: [PATCH 16/16] Address PR review comments

- Revert the `uv run` prefix in the README commands (not needed with a set Python interpreter)
- Hide Output Dir column in cross-run table when only one output dir is in use
---
 apps/README.md   | 4 ++--
 apps/analysis.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/apps/README.md b/apps/README.md
index 83758ddb..2da58a34 100644
--- a/apps/README.md
+++ b/apps/README.md
@@ -9,13 +9,13 @@ Interactive dashboard for visualizing and comparing results.
 ### Usage
 
 ```bash
-uv run streamlit run apps/analysis.py
+streamlit run apps/analysis.py
 ```
 
 By default, the app looks for runs in the `output/` directory. You can change this in the sidebar or by setting the `EVA_OUTPUT_DIR` environment variable:
 
 ```bash
-EVA_OUTPUT_DIR=path/to/results uv run streamlit run apps/analysis.py
+EVA_OUTPUT_DIR=path/to/results streamlit run apps/analysis.py
 ```
 
 ### Views
diff --git a/apps/analysis.py b/apps/analysis.py
index 20be9f0e..40516a78 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -1056,7 +1056,8 @@ def render_cross_run_comparison(run_dirs: list[Path]):
     experience_metrics = [m for m in ordered_metrics if _METRIC_GROUP.get(m) == "Experience"]
     other_metrics = [m for m in ordered_metrics if _METRIC_GROUP.get(m) not in {"Accuracy", "Experience"}]
 
-    id_cols = ["system_name", "run_timestamp", "run_output_dir", "records"]
+    multiple_output_dirs = summary_df["run_output_dir"].nunique() > 1
+    id_cols = ["system_name", "run_timestamp"] + (["run_output_dir"] if multiple_output_dirs else []) + ["records"]
     id_rename = {
         "system_name": "System",
         "run_timestamp": "Timestamp",