From 3c0c995b473c046d8021abb5b34a830ccd94d868 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Fri, 10 Apr 2026 09:56:21 -0400 Subject: [PATCH 01/16] Support multiple output path --- apps/analysis.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index af4b1460..a5b3edaa 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1752,14 +1752,18 @@ def main(): query_params = st.query_params # Sidebar: output directory selection - st.sidebar.header("Output Directory") + st.sidebar.header("Output Directories") default_output = query_params.get("output_dir", _DEFAULT_OUTPUT_DIR) - output_dir = Path(st.sidebar.text_input("Path to output directory", value=default_output)) + # Normalize comma-separated (from URL query params) to newline-separated (for text area) + if "," in default_output and "\n" not in default_output: + default_output = "\n".join(p.strip() for p in default_output.split(",")) + output_dirs_input = st.sidebar.text_area("Paths to output directories (one per line)", value=default_output) + output_dirs = [Path(p.strip()) for p in output_dirs_input.splitlines() if p.strip()] or [Path(_DEFAULT_OUTPUT_DIR)] - run_dirs = get_run_directories(output_dir) + run_dirs = [rd for od in output_dirs for rd in get_run_directories(od)] if not run_dirs: - st.error(f"No run directories found in {output_dir}") + st.error(f"No run directories found in: {', '.join(str(d) for d in output_dirs)}") return # View mode @@ -1770,17 +1774,19 @@ def main(): view_mode = st.sidebar.radio("View", view_options, index=default_view_idx, label_visibility="collapsed") if view_mode == "Cross-Run Comparison": - render_cross_run_comparison([output_dir / d.name for d in run_dirs], str(output_dir)) + render_cross_run_comparison(run_dirs, ", ".join(str(d) for d in output_dirs)) return # Sidebar: run selection st.sidebar.header("Run Selection") - run_dir_names = [d.name for d in run_dirs] + multiple_output_dirs = len(output_dirs) > 1 + run_dir_labels = [str(d) if multiple_output_dirs else d.name for d in run_dirs] default_run_idx = 0 - if "run" in query_params and query_params["run"] in run_dir_names: - default_run_idx = run_dir_names.index(query_params["run"]) - selected_run_name = st.sidebar.selectbox("Select Run", run_dir_names, index=default_run_idx) - selected_run_dir = output_dir / selected_run_name + if "run" in query_params and query_params["run"] in run_dir_labels: + default_run_idx = run_dir_labels.index(query_params["run"]) + selected_run_label = st.sidebar.selectbox("Select Run", run_dir_labels, index=default_run_idx) + selected_run_dir = run_dirs[run_dir_labels.index(selected_run_label)] + selected_run_name = selected_run_dir.name run_config = _load_run_config(selected_run_dir) if run_config: @@ -1791,9 +1797,9 @@ def main(): if view_mode == "Run Overview": st.query_params.from_dict( { - "output_dir": str(output_dir), + "output_dir": ",".join(str(d) for d in output_dirs), "view": "Run Overview", - "run": selected_run_name, + "run": selected_run_label, } ) render_run_overview(selected_run_dir) @@ -1840,9 +1846,9 @@ def main(): # Update query params for deep linking new_params = { - "output_dir": str(output_dir), + "output_dir": ",".join(str(d) for d in output_dirs), "view": "Record Detail", - "run": selected_run_name, + "run": selected_run_label, "record": selected_record_name, } if selected_trial: From 4b9c0556ca80015ed2957ebe0ae8200621b70238 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Fri, 10 Apr 2026 09:56:27 -0400 Subject: [PATCH 02/16] Add toggle to show only the latest run per system --- apps/analysis.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/apps/analysis.py b/apps/analysis.py index a5b3edaa..35632039 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -12,6 +12,7 @@ import html import json import os +import re from pathlib import Path import pandas as pd @@ -101,6 +102,28 @@ def get_run_directories(output_dir: Path) -> list[Path]: return sorted(run_dirs, key=lambda d: d.name, reverse=True) +def _system_name_from_run(run_dir: Path) -> str: + """Extract the system name from a run folder name (_).""" + m = re.match(r"^\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}\.\d+_(.+)$", run_dir.name) + return m.group(1) if m else run_dir.name + + +def filter_latest_runs(run_dirs: list[Path]) -> list[Path]: + """Keep only the most recent run per system name. + + Assumes run_dirs is already sorted newest-first (as returned by + get_run_directories), so the first occurrence of each system name wins. + """ + seen: set[str] = set() + result = [] + for d in run_dirs: + system = _system_name_from_run(d) + if system not in seen: + seen.add(system) + result.append(d) + return result + + def get_record_directories(run_dir: Path) -> list[Path]: """Get all record directories in a run, sorted by record ID.""" records_dir = run_dir / "records" @@ -1762,6 +1785,10 @@ def main(): run_dirs = [rd for od in output_dirs for rd in get_run_directories(od)] + latest_only = st.sidebar.toggle("Latest run per system only", value=True) + if latest_only: + run_dirs = filter_latest_runs(run_dirs) + if not run_dirs: st.error(f"No run directories found in: {', '.join(str(d) for d in output_dirs)}") return From e2f0833a99b332d4a0eeb19122f40df00f357900 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Fri, 10 Apr 2026 10:54:10 -0400 Subject: [PATCH 03/16] Show system name before timestamp in cross-run comparison run labels --- apps/analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/analysis.py b/apps/analysis.py index 35632039..ffcfdd14 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -476,7 +476,7 @@ def _get_run_label(run_name: str, run_config: dict) -> str: suffix = _model_suffix_from_config(run_config) if not suffix or suffix in run_name: return run_name - return f"{run_name} ({suffix})" + return f"{suffix} ({run_name})" def _color_cell(val): From 7a05cf2e50aeb292590a909786de08df3ce7df2e Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Fri, 10 Apr 2026 11:03:35 -0400 Subject: [PATCH 04/16] Reformat timestamp-prefixed run names as system_name (timestamp) in labels --- apps/analysis.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/apps/analysis.py b/apps/analysis.py index ffcfdd14..656cef12 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -471,8 +471,15 @@ def _model_suffix_from_config(run_config: dict) -> str: return "_".join(p for p in parts if p) +_TIMESTAMP_RUN_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}\.\d+)_(.+)$") + + def _get_run_label(run_name: str, run_config: dict) -> str: """Build a display label for a run, appending model info if not already in the name.""" + # If the run name is _, reformat as () + m = _TIMESTAMP_RUN_RE.match(run_name) + if m: + return f"{m.group(2)} ({m.group(1)})" suffix = _model_suffix_from_config(run_config) if not suffix or suffix in run_name: return run_name From 4e60c125cd3697eaf6e317c88ed1cce8d901996a Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Fri, 10 Apr 2026 11:19:46 -0400 Subject: [PATCH 05/16] Split run label into separate System and Timestamp columns in cross-run comparison table --- apps/analysis.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index 656cef12..96b735a3 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -474,16 +474,21 @@ def _model_suffix_from_config(run_config: dict) -> str: _TIMESTAMP_RUN_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}\.\d+)_(.+)$") -def _get_run_label(run_name: str, run_config: dict) -> str: - """Build a display label for a run, appending model info if not already in the name.""" - # If the run name is _, reformat as () +def _get_system_and_timestamp(run_name: str, run_config: dict) -> tuple[str, str]: + """Return (system_name, timestamp) for a run.""" m = _TIMESTAMP_RUN_RE.match(run_name) if m: - return f"{m.group(2)} ({m.group(1)})" + return m.group(2), m.group(1) suffix = _model_suffix_from_config(run_config) - if not suffix or suffix in run_name: - return run_name - return f"{suffix} ({run_name})" + if suffix and suffix not in run_name: + return f"{suffix} ({run_name})", "" + return run_name, "" + + +def _get_run_label(run_name: str, run_config: dict) -> str: + """Build a display label for a run (used for chart legends).""" + system, timestamp = _get_system_and_timestamp(run_name, run_config) + return f"{system} ({timestamp})" if timestamp else system def _color_cell(val): @@ -973,9 +978,12 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): metric_names = list(per_metric.keys()) all_metric_names.update(metric_names) model_details = _extract_model_details(run_config) + system_name, run_timestamp = _get_system_and_timestamp(run_name, run_config) summary: dict = { "run": run_name, "label": _get_run_label(run_name, run_config), + "system_name": system_name, + "run_timestamp": run_timestamp, "records": metrics_summary.get("total_records", 0), "pipeline_type": _classify_pipeline_type(run_config), **model_details, @@ -997,9 +1005,12 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): all_metric_names.update(metric_names) df = pd.DataFrame(rows) model_details = _extract_model_details(run_config) + system_name, run_timestamp = _get_system_and_timestamp(run_name, run_config) summary = { "run": run_name, "label": _get_run_label(run_name, run_config), + "system_name": system_name, + "run_timestamp": run_timestamp, "records": len(df), "pipeline_type": _classify_pipeline_type(run_config), **model_details, @@ -1060,7 +1071,7 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): # Metrics table: EVA composites first, then all individual metrics table_composites = [c for c in _EVA_BAR_COMPOSITES if c in summary_df.columns] - display_cols = ["label", "records"] + table_composites + ordered_metrics + display_cols = ["system_name", "run_timestamp", "records"] + table_composites + ordered_metrics display_df = summary_df[display_cols].copy() # Add link column to navigate to Run Overview @@ -1072,7 +1083,15 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): ) composite_rename = {c: f"[EVA] {_EVA_COMPOSITE_DISPLAY[c]}" for c in table_composites} - display_df = display_df.rename(columns={"label": "Run", "records": "# Records", **composite_rename, **col_rename}) + display_df = display_df.rename( + columns={ + "system_name": "System", + "run_timestamp": "Timestamp", + "records": "# Records", + **composite_rename, + **col_rename, + } + ) renamed_composites = [composite_rename[c] for c in table_composites] renamed_metrics = [col_rename[m] for m in ordered_metrics] all_score_cols = renamed_composites + renamed_metrics From e9b1c974c504bb70569830e29c50c0ff5e446a6c Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Fri, 10 Apr 2026 16:36:31 -0400 Subject: [PATCH 06/16] Fix link generation for multiple output dirs in cross-run comparison table --- apps/analysis.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index 96b735a3..3994b021 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -981,6 +981,7 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): system_name, run_timestamp = _get_system_and_timestamp(run_name, run_config) summary: dict = { "run": run_name, + "run_output_dir": str(run_dir.parent), "label": _get_run_label(run_name, run_config), "system_name": system_name, "run_timestamp": run_timestamp, @@ -1008,6 +1009,7 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): system_name, run_timestamp = _get_system_and_timestamp(run_name, run_config) summary = { "run": run_name, + "run_output_dir": str(run_dir.parent), "label": _get_run_label(run_name, run_config), "system_name": system_name, "run_timestamp": run_timestamp, @@ -1074,12 +1076,11 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): display_cols = ["system_name", "run_timestamp", "records"] + table_composites + ordered_metrics display_df = summary_df[display_cols].copy() - # Add link column to navigate to Run Overview - output_dir_str = str(run_dirs[0].parent) if run_dirs else "" + # Add link column to navigate to Run Overview (use per-run output dir to support multiple output dirs) display_df.insert( 0, "link", - summary_df["run"].apply(lambda r: f"?output_dir={output_dir_str}&view=Run+Overview&run={r}"), + summary_df.apply(lambda row: f"?output_dir={row['run_output_dir']}&view=Run+Overview&run={row['run']}", axis=1), ) composite_rename = {c: f"[EVA] {_EVA_COMPOSITE_DISPLAY[c]}" for c in table_composites} From b9a267979a9f08480d3229a00ab73f61ad4e5306 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Mon, 13 Apr 2026 16:58:48 -0400 Subject: [PATCH 07/16] Add toggle to hide failed attempts in run overview per-record table Failed attempt rows (from *_failed_attempt_* directories) are now flagged in _collect_run_metrics and filtered out by default. A toggle appears above the table when failed attempts exist, allowing the user to show them. --- apps/analysis.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index 3994b021..efded35e 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -506,13 +506,18 @@ def _color_cell(val): def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]: - """Collect all metrics rows for a run. Returns (rows, metric_names).""" + """Collect all metrics rows for a run. Returns (rows, metric_names). + + Rows for failed attempts (directories named *_failed_attempt_*) are marked + with ``_is_failed_attempt=True`` so the caller can filter them. + """ record_dirs = get_record_directories(run_dir) rows: list[dict] = [] all_metric_names: set[str] = set() for record_dir in record_dirs: record_id = record_dir.name + is_failed_attempt = "_failed_attempt_" in record_id data_dirs = _get_record_data_dirs(record_dir) for trial_label, data_path in data_dirs: @@ -520,7 +525,7 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]: if not metrics: continue - row: dict = {"record": record_id} + row: dict = {"record": record_id, "_is_failed_attempt": is_failed_attempt} if trial_label: row["trial"] = trial_label @@ -1264,12 +1269,19 @@ def render_run_overview(run_dir: Path): # --- Per-record table --- st.markdown("### Per-Record Metrics") + has_failed_attempts = df["_is_failed_attempt"].any() + show_failed = False + if has_failed_attempts: + show_failed = st.toggle("Show failed attempts", value=False) + + table_df = df if show_failed else df[~df["_is_failed_attempt"]] + run_name = run_dir.name leading_cols = ["record"] if has_trials: leading_cols.append("trial") - ordered_metrics = [m for m in metric_names if m in df.columns] - df = df[leading_cols + ordered_metrics] + ordered_metrics = [m for m in metric_names if m in table_df.columns] + table_df = table_df[leading_cols + ordered_metrics] # Add link column to navigate to Record Detail def _record_link(row): @@ -1278,12 +1290,12 @@ def _record_link(row): params += f"&trial={row['trial']}" return params - df = df.copy() - df.insert(0, "link", df.apply(_record_link, axis=1)) - df = df.rename(columns=col_rename) + table_df = table_df.copy() + table_df.insert(0, "link", table_df.apply(_record_link, axis=1)) + table_df = table_df.rename(columns=col_rename) renamed_metrics = [col_rename[m] for m in ordered_metrics] - styled = df.style.map(_color_cell, subset=renamed_metrics) + styled = table_df.style.map(_color_cell, subset=renamed_metrics) styled = styled.format(dict.fromkeys(renamed_metrics, "{:.3f}"), na_rep="—") st.dataframe( styled, @@ -1293,7 +1305,7 @@ def _record_link(row): }, ) - csv = df.drop(columns=["link"]).to_csv(index=False) + csv = table_df.drop(columns=["link"]).to_csv(index=False) st.download_button("Download CSV", csv, file_name=f"{run_dir.name}_metrics.csv", mime="text/csv") From 5c4e67cc5ee88010674db0c96a3e153fb1b74f38 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Mon, 13 Apr 2026 16:59:16 -0400 Subject: [PATCH 08/16] Split cross-run comparison table into Accuracy, EVA-X, and Diagnostic sections Adds an Output Dir column showing the parent directory of each run, and splits the single metrics table into three labelled subtables: Accuracy (EVA-A composites + [Accuracy] metrics), EVA-X (EVA-X composites + [Experience] metrics), and Diagnostic & Other (remaining metrics). Each subtable is skipped if it has no data for the current run set. --- apps/analysis.py | 69 ++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index efded35e..25260707 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1076,41 +1076,46 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): ) st.plotly_chart(bar_fig) - # Metrics table: EVA composites first, then all individual metrics - table_composites = [c for c in _EVA_BAR_COMPOSITES if c in summary_df.columns] - display_cols = ["system_name", "run_timestamp", "records"] + table_composites + ordered_metrics - display_df = summary_df[display_cols].copy() - - # Add link column to navigate to Run Overview (use per-run output dir to support multiple output dirs) - display_df.insert( - 0, - "link", - summary_df.apply(lambda row: f"?output_dir={row['run_output_dir']}&view=Run+Overview&run={row['run']}", axis=1), + # Split metrics into three groups by category + eva_a_composites = [c for c in ["EVA-A_pass", "EVA-A_mean"] if c in summary_df.columns] + eva_x_composites = [c for c in ["EVA-X_pass", "EVA-X_mean"] if c in summary_df.columns] + accuracy_metrics = [m for m in ordered_metrics if _METRIC_GROUP.get(m) == "Accuracy"] + experience_metrics = [m for m in ordered_metrics if _METRIC_GROUP.get(m) == "Experience"] + other_metrics = [m for m in ordered_metrics if _METRIC_GROUP.get(m) not in {"Accuracy", "Experience"}] + + id_cols = ["system_name", "run_timestamp", "run_output_dir", "records"] + id_rename = { + "system_name": "System", + "run_timestamp": "Timestamp", + "run_output_dir": "Output Dir", + "records": "# Records", + } + link_series = summary_df.apply( + lambda row: f"?output_dir={row['run_output_dir']}&view=Run+Overview&run={row['run']}", + axis=1, ) - composite_rename = {c: f"[EVA] {_EVA_COMPOSITE_DISPLAY[c]}" for c in table_composites} - display_df = display_df.rename( - columns={ - "system_name": "System", - "run_timestamp": "Timestamp", - "records": "# Records", - **composite_rename, - **col_rename, - } - ) - renamed_composites = [composite_rename[c] for c in table_composites] - renamed_metrics = [col_rename[m] for m in ordered_metrics] - all_score_cols = renamed_composites + renamed_metrics + def _show_subtable(heading: str, composites: list, metrics: list) -> None: + if not composites and not metrics: + return + st.markdown(f"#### {heading}") + composite_rename = {c: f"[EVA] {_EVA_COMPOSITE_DISPLAY[c]}" for c in composites} + cols = id_cols + composites + metrics + sub_df = summary_df[cols].copy() + sub_df.insert(0, "link", link_series) + sub_df = sub_df.rename(columns={**id_rename, **composite_rename, **col_rename}) + score_cols = [composite_rename[c] for c in composites] + [col_rename[m] for m in metrics] + styled = sub_df.style.map(_color_cell, subset=score_cols) + styled = styled.format(dict.fromkeys(score_cols, "{:.3f}"), na_rep="—") + st.dataframe( + styled, + hide_index=True, + column_config={"link": st.column_config.LinkColumn(" ", display_text="🔍", width=40)}, + ) - styled = display_df.style.map(_color_cell, subset=all_score_cols) - styled = styled.format(dict.fromkeys(all_score_cols, "{:.3f}"), na_rep="—") - st.dataframe( - styled, - hide_index=True, - column_config={ - "link": st.column_config.LinkColumn(" ", display_text="🔍", width=40), - }, - ) + _show_subtable("Accuracy Metrics", eva_a_composites, accuracy_metrics) + _show_subtable("EVA-X Metrics", eva_x_composites, experience_metrics) + _show_subtable("Diagnostic & Other Metrics", [], other_metrics) csv = summary_df.drop(columns=["label"]).to_csv(index=False) st.download_button("Download CSV", csv, file_name="cross_run_comparison.csv", mime="text/csv") From 369171bdc9f8efdd5cc260e3a8d5ec2533260ff9 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Mon, 13 Apr 2026 17:06:53 -0400 Subject: [PATCH 09/16] Fix failed attempt detection to check trial label instead of record id --- apps/analysis.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index 25260707..e98b98df 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -517,7 +517,6 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]: for record_dir in record_dirs: record_id = record_dir.name - is_failed_attempt = "_failed_attempt_" in record_id data_dirs = _get_record_data_dirs(record_dir) for trial_label, data_path in data_dirs: @@ -525,6 +524,7 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]: if not metrics: continue + is_failed_attempt = "_failed_attempt_" in trial_label row: dict = {"record": record_id, "_is_failed_attempt": is_failed_attempt} if trial_label: row["trial"] = trial_label @@ -1113,8 +1113,8 @@ def _show_subtable(heading: str, composites: list, metrics: list) -> None: column_config={"link": st.column_config.LinkColumn(" ", display_text="🔍", width=40)}, ) - _show_subtable("Accuracy Metrics", eva_a_composites, accuracy_metrics) - _show_subtable("EVA-X Metrics", eva_x_composites, experience_metrics) + _show_subtable("Accuracy Metrics (EVA-A)", eva_a_composites, accuracy_metrics) + _show_subtable("Experience Metrics (EVA-X)", eva_x_composites, experience_metrics) _show_subtable("Diagnostic & Other Metrics", [], other_metrics) csv = summary_df.drop(columns=["label"]).to_csv(index=False) From fc49c3ecd02396afd349c7c032b7ed7452146db4 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 11:08:52 -0400 Subject: [PATCH 10/16] Update analysis app run command to use uv run --- apps/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/README.md b/apps/README.md index 2da58a34..83758ddb 100644 --- a/apps/README.md +++ b/apps/README.md @@ -9,13 +9,13 @@ Interactive dashboard for visualizing and comparing results. ### Usage ```bash -streamlit run apps/analysis.py +uv run streamlit run apps/analysis.py ``` By default, the app looks for runs in the `output/` directory. You can change this in the sidebar or by setting the `EVA_OUTPUT_DIR` environment variable: ```bash -EVA_OUTPUT_DIR=path/to/results streamlit run apps/analysis.py +EVA_OUTPUT_DIR=path/to/results uv run streamlit run apps/analysis.py ``` ### Views From f7f822ac5095b27f590d144ffba67ed85db58b15 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 11:11:13 -0400 Subject: [PATCH 11/16] Group metrics by category in record detail metrics tab --- apps/analysis.py | 102 ++++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index e98b98df..88b09cfa 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1322,54 +1322,64 @@ def render_metrics_tab(metrics: RecordMetrics | None): st.markdown("### Metrics") + # Group metrics by category, preserving insertion order within each group + grouped: dict[str, list[tuple[str, object]]] = {} for metric_name, metric_score in metrics.metrics.items(): - with st.expander( - f"**{metric_name}**: {metric_score.normalized_score:.3f}" - if metric_score.normalized_score is not None - else f"**{metric_name}**" - ): - col1, col2 = st.columns([1, 3]) - - with col1: - st.metric("Score", f"{metric_score.score:.3f}" if metric_score.score is not None else "N/A") - st.metric( - "Normalized", - f"{metric_score.normalized_score:.3f}" if metric_score.normalized_score is not None else "N/A", - ) - if metric_score.error: - st.error(f"Error: {metric_score.error}") - - with col2: - if metric_score.details: - st.markdown("**Details:**") - if "explanation" in metric_score.details: - st.write(metric_score.details["explanation"]) - - if "judge_prompt" in metric_score.details: - with st.expander("View Judge Prompt"): - prompt = metric_score.details["judge_prompt"] - if isinstance(prompt, str): - st.text(prompt) - else: - st.json(prompt) - elif "judge_prompts" in metric_score.details: - with st.expander("View Judge Prompts"): - prompts = metric_score.details["judge_prompts"] - if isinstance(prompts, list): - for i, prompt in enumerate(prompts): - st.markdown(f"**Turn {i + 1}:**") + cat = _METRIC_GROUP.get(metric_name, "Other") + grouped.setdefault(cat, []).append((metric_name, metric_score)) + + for cat in _CATEGORY_ORDER + [c for c in grouped if c not in _CATEGORY_ORDER]: + if cat not in grouped: + continue + st.markdown(f"#### {cat}") + for metric_name, metric_score in grouped[cat]: + with st.expander( + f"**{metric_name}**: {metric_score.normalized_score:.3f}" + if metric_score.normalized_score is not None + else f"**{metric_name}**" + ): + col1, col2 = st.columns([1, 3]) + + with col1: + st.metric("Score", f"{metric_score.score:.3f}" if metric_score.score is not None else "N/A") + st.metric( + "Normalized", + f"{metric_score.normalized_score:.3f}" if metric_score.normalized_score is not None else "N/A", + ) + if metric_score.error: + st.error(f"Error: {metric_score.error}") + + with col2: + if metric_score.details: + st.markdown("**Details:**") + if "explanation" in metric_score.details: + st.write(metric_score.details["explanation"]) + + if "judge_prompt" in metric_score.details: + with st.expander("View Judge Prompt"): + prompt = metric_score.details["judge_prompt"] + if isinstance(prompt, str): st.text(prompt) - st.divider() - else: - st.json(prompts) - - details_to_show = { - k: v - for k, v in metric_score.details.items() - if k not in ["explanation", "judge_prompt", "judge_prompts"] - } - if details_to_show: - st.json(details_to_show) + else: + st.json(prompt) + elif "judge_prompts" in metric_score.details: + with st.expander("View Judge Prompts"): + prompts = metric_score.details["judge_prompts"] + if isinstance(prompts, list): + for i, prompt in enumerate(prompts): + st.markdown(f"**Turn {i + 1}:**") + st.text(prompt) + st.divider() + else: + st.json(prompts) + + details_to_show = { + k: v + for k, v in metric_score.details.items() + if k not in ["explanation", "judge_prompt", "judge_prompts"] + } + if details_to_show: + st.json(details_to_show) def render_processed_data_tab(metrics: RecordMetrics | None): From e8126828b60084ebe335402188774b4b00617aa7 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 11:18:09 -0400 Subject: [PATCH 12/16] Show dimension scores inline in metrics detail tab --- apps/analysis.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/apps/analysis.py b/apps/analysis.py index 88b09cfa..6df26f19 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1349,6 +1349,20 @@ def render_metrics_tab(metrics: RecordMetrics | None): if metric_score.error: st.error(f"Error: {metric_score.error}") + # Dimension scores (e.g. faithfulness, conversation_progression) + explanation = metric_score.details.get("explanation") if metric_score.details else None + dimensions = explanation.get("dimensions") if isinstance(explanation, dict) else None + if dimensions: + st.markdown("**Dimensions**") + for dim_name, dim_data in dimensions.items(): + if isinstance(dim_data, dict): + rating = dim_data.get("rating") + flagged = dim_data.get("flagged", False) + label = dim_name.replace("_", " ").title() + score_str = f"{rating}/3" if rating is not None else "N/A" + prefix = "⚠ " if flagged else "" + st.markdown(f"{prefix}**{label}:** {score_str}") + with col2: if metric_score.details: st.markdown("**Details:**") From 327fb88a1db16b7a5193e0927aad74a3c5ea4637 Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 11:24:33 -0400 Subject: [PATCH 13/16] Show metric groups in 4 columns in conversation trace tab --- apps/analysis.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index 6df26f19..d42a6fc5 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1543,10 +1543,34 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat group = _METRIC_GROUP.get(name, "Other") grouped.setdefault(group, []).append(name) - for group in ["Accuracy", "Experience", "Validation", "Other"]: - names_in_group = grouped.get(group) - if not names_in_group: - continue + # Accuracy / Experience / Diagnostic / Validation side by side in 4 columns + col_groups = ["Accuracy", "Experience", "Diagnostic", "Validation"] + present_col_groups = [g for g in col_groups if grouped.get(g)] + if present_col_groups: + outer_cols = st.columns(len(present_col_groups)) + for outer_col, group in zip(outer_cols, present_col_groups): + names_in_group = grouped[group] + with outer_col: + st.caption(group) + for name in names_in_group: + m = all_top_metrics[name] + score = m["normalized_score"] + display_name = _format_metric_name(name) + score_str = f"{score:.3f}" if score is not None else "N/A" + icon = None if score is None else "🟢" if score >= 0.8 else "🟡" if score >= 0.4 else "🔴" + st.button( + f"{display_name}\n{score_str}", + key=f"metric_btn_{name}", + on_click=st.session_state.update, + kwargs={"selected_metric": None if selected == name else name}, + type="primary" if selected == name else "secondary", + icon=icon, + width="stretch", + ) + + # Any remaining groups (Other, Conversation Quality, etc.) rendered below + for group in [g for g in grouped if g not in col_groups]: + names_in_group = grouped[group] st.caption(group) cols = st.columns(min(len(names_in_group), 5)) for i, name in enumerate(names_in_group): From c7719fffbf51c04b0ddde9860ae7635c41a165ed Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 11:27:12 -0400 Subject: [PATCH 14/16] Fix per-turn breakdown boxes invisible text in dark mode --- apps/analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index d42a6fc5..81312930 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1638,7 +1638,7 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat else: color = "#f44336" st.html( - f'
' + f'
' f"Turn {tid_str} — " f'{label} ' f'(latency: {latency_str})' @@ -1661,7 +1661,7 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat color = _score_color(rating) rating_str = f"{rating:.2f}" if isinstance(rating, (int, float)) else str(rating) st.html( - f'
' + f'
' f"Turn {tid_str} — " f'{rating_str}' f"{'
' + html.escape(str(explanation)) + '' if explanation else ''}" From c77a04cccb802ccabf1a2716f39761d1cc2bf52e Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Tue, 14 Apr 2026 16:24:26 -0400 Subject: [PATCH 15/16] Fix system/timestamp split for timestamp-only run directory names --- apps/analysis.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/apps/analysis.py b/apps/analysis.py index 5341739e..20be9f0e 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -472,6 +472,7 @@ def _model_suffix_from_config(run_config: dict) -> str: _TIMESTAMP_RUN_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}\.\d+)_(.+)$") +_TIMESTAMP_ONLY_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}\.\d+)$") def _get_system_and_timestamp(run_name: str, run_config: dict) -> tuple[str, str]: @@ -479,6 +480,11 @@ def _get_system_and_timestamp(run_name: str, run_config: dict) -> tuple[str, str m = _TIMESTAMP_RUN_RE.match(run_name) if m: return m.group(2), m.group(1) + # Timestamp-only directory (no system name suffix) — still extract the timestamp + m = _TIMESTAMP_ONLY_RE.match(run_name) + if m: + suffix = _model_suffix_from_config(run_config) + return suffix or "", m.group(1) suffix = _model_suffix_from_config(run_config) if suffix and suffix not in run_name: return f"{suffix} ({run_name})", "" From fa30ec57976439b78a2e044697912d6745dba41d Mon Sep 17 00:00:00 2001 From: "fanny.riols" Date: Wed, 15 Apr 2026 09:05:08 -0400 Subject: [PATCH 16/16] Address PR review comments - Revert uv run from README (not needed with a set Python interpreter) - Hide Output Dir column in cross-run table when only one output dir is in use --- apps/README.md | 4 ++-- apps/analysis.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/README.md b/apps/README.md index 83758ddb..2da58a34 100644 --- a/apps/README.md +++ b/apps/README.md @@ -9,13 +9,13 @@ Interactive dashboard for visualizing and comparing results. ### Usage ```bash -uv run streamlit run apps/analysis.py +streamlit run apps/analysis.py ``` By default, the app looks for runs in the `output/` directory. You can change this in the sidebar or by setting the `EVA_OUTPUT_DIR` environment variable: ```bash -EVA_OUTPUT_DIR=path/to/results uv run streamlit run apps/analysis.py +EVA_OUTPUT_DIR=path/to/results streamlit run apps/analysis.py ``` ### Views diff --git a/apps/analysis.py b/apps/analysis.py index 20be9f0e..40516a78 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1056,7 +1056,8 @@ def render_cross_run_comparison(run_dirs: list[Path]): experience_metrics = [m for m in ordered_metrics if _METRIC_GROUP.get(m) == "Experience"] other_metrics = [m for m in ordered_metrics if _METRIC_GROUP.get(m) not in {"Accuracy", "Experience"}] - id_cols = ["system_name", "run_timestamp", "run_output_dir", "records"] + multiple_output_dirs = summary_df["run_output_dir"].nunique() > 1 + id_cols = ["system_name", "run_timestamp"] + (["run_output_dir"] if multiple_output_dirs else []) + ["records"] id_rename = { "system_name": "System", "run_timestamp": "Timestamp",