OpenMS · t0mdavid-m · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/src/parse/tag_resolution.py b/src/parse/tag_resolution.py
@@ -0,0 +1,41 @@
+import pandas as pd
+
+
+def _split_ints(s):  # ints from a ';'/whitespace-separated field; non-[-]digits tokens are skipped
+    return [int(x) for x in str(s).replace(';', ' ').split()
+            if (x[1:] if x[0] == '-' else x).isdecimal()]
+
+
+def build_tagspace_to_proteoform_map(raw_tag_df, raw_protein_df):
+    """Translate FLASHTagger tag-space proteoform indices to protein-space.
+
+    tags.tsv numbers every candidate proteoform (tag-space, decoys included),
+    whereas protein.tsv numbers only the proteoforms that survived
+    (protein-space). The two are linked through protein.tsv's TagIndices and
+    tags.tsv's ProteoformIndex. The relation is treated as a strictly monotonic
+    bijection and resolved greedily, visiting proteoforms in ascending
+    protein-space order.
+
+    Returns {tag_space_index: protein_space_index}. Tag-space indices without a
+    surviving proteoform are left out (callers map them to -1).
+    """
+    # TagIndex -> set of tag-space proteoform indices listed for that tag.
+    tag_sets = {
+        int(tag_index): set(_split_ints(proteoforms))
+        for tag_index, proteoforms
+        in zip(raw_tag_df['TagIndex'], raw_tag_df['ProteoformIndex'])
+    }
+    mapping, last_q = {}, -1
+    proteins = raw_protein_df.sort_values('ProteoformIndex')
+    for p, tag_field in zip(proteins['ProteoformIndex'].astype(int), proteins['TagIndices']):
+        per_tag = [tag_sets.get(t, set()) for t in _split_ints(tag_field)]
+        # Indices shared by every tag; if there are none, any tag's indices.
+        shared = set.intersection(*per_tag) if per_tag else set()
+        candidates = shared or set().union(*per_tag)
+        # Greedy step: smallest candidate beyond the last assigned index.
+        q = min((c for c in candidates if c > last_q), default=None)
+        if q is None:
+            continue  # nothing left above last_q; proteoform stays unmapped
+        mapping[q] = int(p)
+        last_q = q
+    return mapping
diff --git a/src/parse/tnt.py b/src/parse/tnt.py
@@ -2,12 +2,15 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow.parquet as pq
+from src.render.sequence_data_store import build_table, ROW_GROUP_SIZE
 
 from io import StringIO
 from pyopenms import AASequence
 from scipy.stats import gaussian_kde
 
 from src.parse.masstable import parseFLASHTaggerOutput
+from src.parse.tag_resolution import build_tagspace_to_proteoform_map
 from src.render.sequence import (
     remove_ambigious, getFragmentDataFromSeq, getInternalFragmentDataFromSeq
 )
@@ -66,6 +69,9 @@ def parseTnT(file_manager, dataset_id, deconv_mzML, anno_mzML, tag_tsv, protein_
 
     tolerance = file_manager.get_results(dataset_id, ['deconv_tolerance'])['deconv_tolerance']
     tag_df, protein_df = parseFLASHTaggerOutput(tag_tsv, protein_tsv)
+    # Map FLASHTagger tag-space ProteoformIndex -> protein-space index from the
+    # raw frames (before protein_df is renamed and tag_df is linearized).
+    tagspace_to_proteoform = build_tagspace_to_proteoform_map(tag_df, protein_df)
     logger.log("10.0 %", level=2)
 
     # protein_table
@@ -107,10 +113,14 @@ def parseTnT(file_manager, dataset_id, deconv_mzML, anno_mzML, tag_tsv, protein_
     sequence_data = {}
     # internal_fragment_data = {}  # Disabled
     # Compute coverage
-    # Group tag ranges by proteoform once (StartPos/EndPos already shifted above).
+    # tag_df['ProteinIndex'] is tag-space; map to protein-space so coverage uses
+    # each proteoform's own tags (the two enumerations diverge on large runs).
+    proteoform_of_tag = tag_df['ProteinIndex'].map(
+        lambda q: tagspace_to_proteoform.get(int(q), -1) if pd.notna(q) else -1
+    )
     tag_groups = {
         pid: (g['StartPos'].to_numpy(), g['EndPos'].to_numpy())
-        for pid, g in tag_df.groupby('ProteinIndex')[['StartPos', 'EndPos']]
+        for pid, g in tag_df.groupby(proteoform_of_tag)[['StartPos', 'EndPos']]
     }
     for i, row in protein_df.iterrows():
         pid = row['index']
@@ -173,7 +183,9 @@ def parseTnT(file_manager, dataset_id, deconv_mzML, anno_mzML, tag_tsv, protein_
         #     str(sequence)[start_index:end_index+1], modifications
         # )  # Disabled
 
-    file_manager.store_data(dataset_id, 'sequence_data', sequence_data)
+    sequence_data_table = build_table(sequence_data)
+    with file_manager.parquet_sink(dataset_id, 'sequence_data') as sequence_data_path:
+        pq.write_table(sequence_data_table, sequence_data_path, row_group_size=ROW_GROUP_SIZE)
     # file_manager.store_data(
     #     dataset_id, 'internal_fragment_data', internal_fragment_data
     # )  # Disabled

diff --git a/src/render/initialize.py b/src/render/initialize.py
@@ -6,6 +6,29 @@
     FDRPlotly, FLASHQuant
 )
 from src.render.compression import compute_compression_levels
+from src.render.scan_resolution import build_proteoform_scan_map
+
+
+def _attach_proteoform_scan_map(file_manager, selected_data, additional_data):
+    """Store the proteoform -> scan map for selected_data in additional_data."""
+    def load(name):
+        return file_manager.get_results(selected_data, [name])[name][['index', 'Scan']]
+    additional_data['proteoform_scan_map'] = build_proteoform_scan_map(
+        load('protein_dfs'), load('scan_table'))
+
+
+def _load_scan_scoped(file_manager, selected_data, cache_name, tool, additional_data):
+    """Load the whole cache_name result once (the caller caches it in
+    session_state) and return it. For flashtnt, also put the proteoform->scan
+    map into additional_data so filter_data can slice the selected proteoform's
+    scan in memory, as the FLASHDeconv path does with iloc. Per-click pyarrow
+    pushdown was tried, but each per-scan cache is one parquet row group, so it
+    skipped nothing and re-read the whole file on every click."""
+    loaded = file_manager.get_results(selected_data, [cache_name])
+    if tool == 'flashtnt':
+        _attach_proteoform_scan_map(file_manager, selected_data, additional_data)
+    return loaded[cache_name]
+
 
 def initialize_data(comp_name, selected_data, file_manager, tool):
 
@@ -108,31 +131,31 @@ def initialize_data(comp_name, selected_data, file_manager, tool):
         data_to_send['per_scan_data'] = data['scan_table']
         component_arguments = Tabulator('ScanTable')
     elif comp_name == 'deconv_spectrum':
-        data = file_manager.get_results(selected_data, ['deconv_spectrum'])
-        data_to_send['per_scan_data'] = data['deconv_spectrum']
+        data_to_send['per_scan_data'] = _load_scan_scoped(
+            file_manager, selected_data, 'deconv_spectrum', tool, additional_data)
         component_arguments = PlotlyLineplot(title="Deconvolved Spectrum")
     elif comp_name == 'combined_spectrum':
-        data = file_manager.get_results(selected_data, ['combined_spectrum'])
-        data_to_send['per_scan_data'] = data['combined_spectrum']
+        data_to_send['per_scan_data'] = _load_scan_scoped(
+            file_manager, selected_data, 'combined_spectrum', tool, additional_data)
         component_arguments = PlotlyLineplotTagger(title="Augmented Deconvolved Spectrum")
     elif comp_name == 'anno_spectrum':
-        data = file_manager.get_results(selected_data,  ['combined_spectrum'])
-        data_to_send['per_scan_data'] = data['combined_spectrum']
+        data_to_send['per_scan_data'] = _load_scan_scoped(
+            file_manager, selected_data, 'combined_spectrum', tool, additional_data)
         component_arguments = PlotlyLineplot(title="Annotated Spectrum")
     elif comp_name == 'mass_table':
-        data = file_manager.get_results(selected_data,  ['mass_table'])
-        data_to_send['per_scan_data'] = data['mass_table']
+        data_to_send['per_scan_data'] = _load_scan_scoped(
+            file_manager, selected_data, 'mass_table', tool, additional_data)
         component_arguments = Tabulator('MassTable')
     elif comp_name == '3D_SN_plot':
         data = file_manager.get_results(selected_data,  ['threedim_SN_plot'], use_pyarrow=True)
         data_to_send['per_scan_data'] = data['threedim_SN_plot']
         component_arguments = Plotly3Dplot(title="Precursor Signals")
     elif comp_name == 'sequence_view':
-        data = file_manager.get_results(selected_data,  ['sequence_view'])
-        data_to_send['per_scan_data'] = data['sequence_view']
+        data_to_send['per_scan_data'] = _load_scan_scoped(
+            file_manager, selected_data, 'sequence_view', tool, additional_data)
         if tool == 'flashtnt':
-            data = file_manager.get_results(selected_data,  ['sequence_data'])
-            data_to_send['sequence_data'] = data['sequence_data']
+            seq = file_manager.get_results(selected_data, ['sequence_data'], use_pyarrow=True)
+            additional_data['sequence_data_ds'] = seq['sequence_data']
             data = file_manager.get_results(selected_data,  ['settings'])
             data_to_send['settings'] = data['settings']
         component_arguments = SequenceView(title='Sequence View')
@@ -165,8 +188,8 @@ def initialize_data(comp_name, selected_data, file_manager, tool):
         data_to_send['protein_table'] = data['protein_dfs']
         component_arguments = Tabulator('ProteinTable')
     elif comp_name == 'tag_table':
-        data = file_manager.get_results(selected_data,  ['tag_dfs'])
-        data_to_send['tag_table'] = data['tag_dfs']
+        data_to_send['tag_table'] = _load_scan_scoped(
+            file_manager, selected_data, 'tag_dfs', tool, additional_data)
         component_arguments = Tabulator('TagTable')
     elif comp_name == 'quant_visualization':
         data = file_manager.get_results(selected_data,  ['quant_dfs'])

diff --git a/src/render/scan_resolution.py b/src/render/scan_resolution.py
@@ -0,0 +1,27 @@
+import pandas as pd
+
+
+def build_proteoform_scan_map(protein_df, scan_table_df):
+    """Link each proteoform to its scan number and deconv row index.
+
+    protein_df supplies 'index' (proteoform index) and 'Scan'; scan_table_df
+    supplies 'index' (deconv row index) and 'Scan'. A scan number listed more
+    than once in scan_table_df resolves to its first row.
+    Returns {proteoform_index: {'scan': int, 'deconv_index': int}}; proteoforms
+    whose Scan is NaN or missing from scan_table_df are left out.
+    """
+    # Scan -> deconv row index; setdefault keeps the first row seen per scan.
+    first_row_of_scan = {}
+    for scan, row_index in zip(scan_table_df["Scan"], scan_table_df["index"]):
+        first_row_of_scan.setdefault(scan, row_index)
+    resolved = {}
+    for proteoform_index, scan in zip(protein_df["index"], protein_df["Scan"]):
+        if pd.isna(scan):
+            continue
+        scan = int(scan)
+        if scan in first_row_of_scan:
+            resolved[int(proteoform_index)] = {
+                "scan": scan,
+                "deconv_index": int(first_row_of_scan[scan]),
+            }
+    return resolved
diff --git a/src/render/sequence_data_store.py b/src/render/sequence_data_store.py
@@ -0,0 +1,90 @@
+import pyarrow as pa
+import pyarrow.dataset as ds
+import pyarrow.parquet as pq
+
+# One row per proteoform. The schema is explicit so that list columns which may
+# be empty (fixed_modifications always is; modifications varies) keep a fixed type.
+SCHEMA = pa.schema([
+    ("proteoform_index", pa.int64()),
+    ("sequence", pa.list_(pa.string())),
+    ("theoretical_mass", pa.float64()),
+    ("fixed_modifications", pa.list_(pa.string())),
+    ("coverage", pa.list_(pa.float64())),
+    ("maxCoverage", pa.float64()),
+    ("fragment_masses_a", pa.list_(pa.list_(pa.float64()))),
+    ("fragment_masses_b", pa.list_(pa.list_(pa.float64()))),
+    ("fragment_masses_c", pa.list_(pa.list_(pa.float64()))),
+    ("fragment_masses_x", pa.list_(pa.list_(pa.float64()))),
+    ("fragment_masses_y", pa.list_(pa.list_(pa.float64()))),
+    ("fragment_masses_z", pa.list_(pa.list_(pa.float64()))),
+    ("proteoform_start", pa.int64()),
+    ("proteoform_end", pa.int64()),
+    ("computed_mass", pa.float64()),
+    ("modifications", pa.list_(pa.struct([
+        ("start", pa.int64()), ("end", pa.int64()),
+        ("mass_diff", pa.float64()), ("labels", pa.string()),
+    ]))),
+])
+
+ROW_GROUP_SIZE = 64  # rows per parquet row group (used as row_group_size on write)
+ENTRY_KEYS = [f.name for f in SCHEMA if f.name != "proteoform_index"]  # entry fields, schema order
+
+
+def _py(x):
+    """Return x with every numpy scalar, at any depth of nested lists and dicts,
+    replaced by the equivalent builtin (coverage/maxCoverage hold np.float64), so
+    that pa.Table.from_pylist serializes the value cleanly."""
+    import numpy as np
+    # Containers are rebuilt element by element; everything else is a leaf.
+    if isinstance(x, dict):
+        return {key: _py(val) for key, val in x.items()}
+    if isinstance(x, list):
+        return list(map(_py, x))
+    return x.item() if isinstance(x, np.generic) else x
+
+
+def build_table(sequence_data):
+    """Convert {proteoform_index: entry} into a pyarrow Table with SCHEMA.
+
+    One row per proteoform, ordered by proteoform_index, so that every row
+    group covers a contiguous index range that pushdown can skip.
+    """
+    rows = [
+        {"proteoform_index": int(pid),
+         **{key: _py(sequence_data[pid][key]) for key in ENTRY_KEYS}}
+        for pid in sorted(sequence_data)
+    ]
+    return pa.Table.from_pylist(rows, schema=SCHEMA)
+
+
+def _as_dataset(dataset_or_path):
+    """Return a ds.Dataset as-is; open any other value as a parquet dataset path."""
+    already_open = isinstance(dataset_or_path, ds.Dataset)
+    return dataset_or_path if already_open else ds.dataset(str(dataset_or_path), format="parquet")
+
+
+def load_entry(dataset_or_path, proteoform_index):
+    """Read one proteoform's row via a pushdown filter on proteoform_index.
+
+    Returns the row as native Python containers (to_pylist) minus its
+    proteoform_index key, or None if no row matches."""
+    dataset = _as_dataset(dataset_or_path)
+    wanted = ds.field("proteoform_index") == int(proteoform_index)
+    matches = dataset.to_table(filter=wanted).to_pylist()
+    if matches:
+        return {k: v for k, v in matches[0].items() if k != "proteoform_index"}
+    return None
+
+
+def reconstruct_all(dataset_or_path):
+    """Load all rows into {proteoform_index: entry}.
+
+    Intended for migration verification and the golden adapter only, never
+    for the hot render path.
+    """
+    if isinstance(dataset_or_path, ds.Dataset):
+        rows = dataset_or_path.to_table().to_pylist()
+    else:
+        rows = pq.read_table(str(dataset_or_path)).to_pylist()
+    # The key's pop runs before the value is taken, so entries lose proteoform_index.
+    return {row.pop("proteoform_index"): row for row in rows}
diff --git a/src/render/update.py b/src/render/update.py
@@ -7,6 +7,7 @@
 from src.workflow.FileManager import FileManager
 from src.render.sequence import getFragmentDataFromSeq, getInternalFragmentDataFromSeq
 from pathlib import Path
+from src.render.sequence_data_store import load_entry
 
 
 def get_sequence(selection_store):
@@ -117,7 +118,15 @@ def filter_data(data, out_components, selection_store, additional_data, tool):
         'Augmented Deconvolved Spectrum', 
         'Mass Table', 'Sequence View', 'Internal Fragment Map'
     ]:
-        if 'scanIndex' not in selection_store:
+        if tool == 'flashtnt':
+            scan_map = additional_data.get('proteoform_scan_map', {})
+            entry = scan_map.get(selection_store.get('proteinIndex'))
+            if entry is None:
+                data['per_scan_data'] = data['per_scan_data'].iloc[0:0, :]
+            else:
+                per_scan = data['per_scan_data']
+                data['per_scan_data'] = per_scan[per_scan['index'] == entry['deconv_index']]
+        elif 'scanIndex' not in selection_store:
             data['per_scan_data'] = data['per_scan_data'].iloc[0:0,:]
         else:
             data['per_scan_data'] = data['per_scan_data'].iloc[selection_store['scanIndex']:selection_store['scanIndex']+1,:]
@@ -159,23 +168,34 @@ def filter_data(data, out_components, selection_store, additional_data, tool):
         else:
             selected_data = selection_store[selection]
         data['raw_heatmap_df'] = render_heatmap(
-            additional_data['raw_heatmap_df'], 
+            additional_data['raw_heatmap_df'],
             selected_data,
             additional_data['dataset'], component
         )
+    elif component == 'Tag Table':
+        # flashtnt-only panel: tags are scan (spectrum) data. Scope to the
+        # selected proteoform's scan and stamp ProteinIndex so the frontend's
+        # tag.ProteinIndex===selectedProteinIndex filter passes all the scan's
+        # tags through to the table and the on-spectrum overlay.
+        scan_map = additional_data.get('proteoform_scan_map', {})
+        entry = scan_map.get(selection_store.get('proteinIndex'))
+        if entry is None:
+            data['tag_table'] = data['tag_table'].iloc[0:0, :]
+        else:
+            sel = data['tag_table'][data['tag_table']['Scan'] == entry['scan']].copy()
+            sel['ProteinIndex'] = selection_store['proteinIndex']
+            data['tag_table'] = sel
 
     if (
-        (component in ['Internal Fragment Map', 'Sequence View']) 
+        (component in ['Internal Fragment Map', 'Sequence View'])
         and (tool == 'flashtnt')
     ):
         if 'proteinIndex' not in selection_store:
             data['sequence_data'] = {}
         else:
-            data['sequence_data'] = {
-                selection_store['proteinIndex'] : data[
-                    'sequence_data'
-                ][selection_store['proteinIndex']]
-            }
+            pid = selection_store['proteinIndex']
+            entry = load_entry(additional_data['sequence_data_ds'], pid)
+            data['sequence_data'] = {pid: entry} if entry is not None else {}
 
     if (component == 'Internal Fragment Map') and (tool == 'flashtnt'):
         if 'proteinIndex' not in selection_store: