Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions src/parse/tag_resolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd


def _split_ints(s):
return [int(x) for x in str(s).replace(';', ' ').split()
if x.strip().lstrip('-').isdigit()]


def build_tagspace_to_proteoform_map(raw_tag_df, raw_protein_df):
    """Translate FLASHTagger tag-space proteoform indices to protein-space.

    ``raw_tag_df`` (tags.tsv) lists, per ``TagIndex``, the tag-space
    ``ProteoformIndex`` values the tag belongs to; per the original author
    this enumeration covers all candidate proteoforms, decoys included.
    ``raw_protein_df`` (protein.tsv) lists the surviving proteoforms with
    their own ``ProteoformIndex`` and the ``TagIndices`` that support them.

    The two enumerations are assumed to be related by a strictly increasing
    one-to-one mapping. Proteoforms are visited in ascending protein-space
    order, and each one is assigned the smallest still-unused tag-space
    index that is larger than the previously assigned one.

    Returns ``{tag_space_index: protein_space_index}``. Tag-space indices
    without a surviving proteoform are simply missing from the dict
    (callers translate a miss to -1).
    """
    # TagIndex -> tag-space proteoform indices named on that tag row.
    candidates_by_tag = {}
    for tag_index, proteoform_field in zip(
        raw_tag_df['TagIndex'], raw_tag_df['ProteoformIndex']
    ):
        candidates_by_tag[int(tag_index)] = set(_split_ints(proteoform_field))

    mapping = {}
    last_assigned = -1
    by_protein_order = raw_protein_df.sort_values('ProteoformIndex')
    protein_indices = by_protein_order['ProteoformIndex'].astype(int)
    for protein_index, tag_field in zip(
        protein_indices, by_protein_order['TagIndices']
    ):
        per_tag_sets = [
            candidates_by_tag.get(tag, set()) for tag in _split_ints(tag_field)
        ]
        # Prefer indices shared by every tag of this proteoform ...
        pool = set.intersection(*per_tag_sets) if per_tag_sets else set()
        if not pool:
            # ... and fall back to any index named by at least one tag.
            pool = set().union(*per_tag_sets)
        # Monotonicity: only indices beyond the last assignment qualify.
        eligible = [q for q in pool if q > last_assigned]
        if not eligible:
            continue
        chosen = min(eligible)
        mapping[chosen] = int(protein_index)
        last_assigned = chosen
    return mapping
18 changes: 15 additions & 3 deletions src/parse/tnt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from src.render.sequence_data_store import build_table, ROW_GROUP_SIZE

from io import StringIO
from pyopenms import AASequence
from scipy.stats import gaussian_kde

from src.parse.masstable import parseFLASHTaggerOutput
from src.parse.tag_resolution import build_tagspace_to_proteoform_map
from src.render.sequence import (
remove_ambigious, getFragmentDataFromSeq, getInternalFragmentDataFromSeq
)
Expand Down Expand Up @@ -66,6 +69,9 @@ def parseTnT(file_manager, dataset_id, deconv_mzML, anno_mzML, tag_tsv, protein_

tolerance = file_manager.get_results(dataset_id, ['deconv_tolerance'])['deconv_tolerance']
tag_df, protein_df = parseFLASHTaggerOutput(tag_tsv, protein_tsv)
# Map FLASHTagger tag-space ProteoformIndex -> protein-space index from the
# raw frames (before protein_df is renamed and tag_df is linearized).
tagspace_to_proteoform = build_tagspace_to_proteoform_map(tag_df, protein_df)
logger.log("10.0 %", level=2)

# protein_table
Expand Down Expand Up @@ -107,10 +113,14 @@ def parseTnT(file_manager, dataset_id, deconv_mzML, anno_mzML, tag_tsv, protein_
sequence_data = {}
# internal_fragment_data = {} # Disabled
# Compute coverage
# Group tag ranges by proteoform once (StartPos/EndPos already shifted above).
# tag_df['ProteinIndex'] is tag-space; map to protein-space so coverage uses
# each proteoform's own tags (the two enumerations diverge on large runs).
proteoform_of_tag = tag_df['ProteinIndex'].map(
lambda q: tagspace_to_proteoform.get(int(q), -1) if pd.notna(q) else -1
)
tag_groups = {
pid: (g['StartPos'].to_numpy(), g['EndPos'].to_numpy())
for pid, g in tag_df.groupby('ProteinIndex')[['StartPos', 'EndPos']]
for pid, g in tag_df.groupby(proteoform_of_tag)[['StartPos', 'EndPos']]
}
for i, row in protein_df.iterrows():
pid = row['index']
Expand Down Expand Up @@ -173,7 +183,9 @@ def parseTnT(file_manager, dataset_id, deconv_mzML, anno_mzML, tag_tsv, protein_
# str(sequence)[start_index:end_index+1], modifications
# ) # Disabled

file_manager.store_data(dataset_id, 'sequence_data', sequence_data)
sequence_data_table = build_table(sequence_data)
with file_manager.parquet_sink(dataset_id, 'sequence_data') as sequence_data_path:
pq.write_table(sequence_data_table, sequence_data_path, row_group_size=ROW_GROUP_SIZE)
# file_manager.store_data(
# dataset_id, 'internal_fragment_data', internal_fragment_data
# ) # Disabled
Expand Down
51 changes: 37 additions & 14 deletions src/render/initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,29 @@
FDRPlotly, FLASHQuant
)
from src.render.compression import compute_compression_levels
from src.render.scan_resolution import build_proteoform_scan_map


def _attach_proteoform_scan_map(file_manager, selected_data, additional_data):
    """Store the proteoform -> scan lookup under ``additional_data``.

    Reads the cached ``protein_dfs`` and ``scan_table`` results for
    ``selected_data``, passes their ``index``/``Scan`` columns to
    ``build_proteoform_scan_map`` and writes the result to
    ``additional_data['proteoform_scan_map']`` (mutated in place; nothing
    is returned).
    """
    wanted_columns = ['index', 'Scan']
    proteins = file_manager.get_results(selected_data, ['protein_dfs'])['protein_dfs']
    scans = file_manager.get_results(selected_data, ['scan_table'])['scan_table']
    scan_map = build_proteoform_scan_map(
        proteins[wanted_columns], scans[wanted_columns]
    )
    additional_data['proteoform_scan_map'] = scan_map


def _load_scan_scoped(file_manager, selected_data, cache_name, tool, additional_data):
    """Load one per-scan cache in full and return it.

    The whole ``cache_name`` result is fetched eagerly (the caller is
    expected to keep it in session_state). When ``tool`` is ``'flashtnt'``
    the proteoform -> scan map is also attached to ``additional_data`` so
    that ``filter_data`` can slice out the selected proteoform's scan in
    memory, mirroring the FLASHDeconv path that loads once and slices with
    ``iloc``.

    Design note carried over from the original author: a per-click pyarrow
    pushdown was tried, but the per-scan caches are written as a single
    parquet row group, so the pushdown could not skip rows and re-read the
    whole file on every click.
    """
    loaded = file_manager.get_results(selected_data, [cache_name])
    needs_scan_map = (tool == 'flashtnt')
    if needs_scan_map:
        _attach_proteoform_scan_map(file_manager, selected_data, additional_data)
    return loaded[cache_name]


def initialize_data(comp_name, selected_data, file_manager, tool):

Expand Down Expand Up @@ -108,31 +131,31 @@ def initialize_data(comp_name, selected_data, file_manager, tool):
data_to_send['per_scan_data'] = data['scan_table']
component_arguments = Tabulator('ScanTable')
elif comp_name == 'deconv_spectrum':
data = file_manager.get_results(selected_data, ['deconv_spectrum'])
data_to_send['per_scan_data'] = data['deconv_spectrum']
data_to_send['per_scan_data'] = _load_scan_scoped(
file_manager, selected_data, 'deconv_spectrum', tool, additional_data)
component_arguments = PlotlyLineplot(title="Deconvolved Spectrum")
elif comp_name == 'combined_spectrum':
data = file_manager.get_results(selected_data, ['combined_spectrum'])
data_to_send['per_scan_data'] = data['combined_spectrum']
data_to_send['per_scan_data'] = _load_scan_scoped(
file_manager, selected_data, 'combined_spectrum', tool, additional_data)
component_arguments = PlotlyLineplotTagger(title="Augmented Deconvolved Spectrum")
elif comp_name == 'anno_spectrum':
data = file_manager.get_results(selected_data, ['combined_spectrum'])
data_to_send['per_scan_data'] = data['combined_spectrum']
data_to_send['per_scan_data'] = _load_scan_scoped(
file_manager, selected_data, 'combined_spectrum', tool, additional_data)
component_arguments = PlotlyLineplot(title="Annotated Spectrum")
elif comp_name == 'mass_table':
data = file_manager.get_results(selected_data, ['mass_table'])
data_to_send['per_scan_data'] = data['mass_table']
data_to_send['per_scan_data'] = _load_scan_scoped(
file_manager, selected_data, 'mass_table', tool, additional_data)
component_arguments = Tabulator('MassTable')
elif comp_name == '3D_SN_plot':
data = file_manager.get_results(selected_data, ['threedim_SN_plot'], use_pyarrow=True)
data_to_send['per_scan_data'] = data['threedim_SN_plot']
component_arguments = Plotly3Dplot(title="Precursor Signals")
elif comp_name == 'sequence_view':
data = file_manager.get_results(selected_data, ['sequence_view'])
data_to_send['per_scan_data'] = data['sequence_view']
data_to_send['per_scan_data'] = _load_scan_scoped(
file_manager, selected_data, 'sequence_view', tool, additional_data)
if tool == 'flashtnt':
data = file_manager.get_results(selected_data, ['sequence_data'])
data_to_send['sequence_data'] = data['sequence_data']
seq = file_manager.get_results(selected_data, ['sequence_data'], use_pyarrow=True)
additional_data['sequence_data_ds'] = seq['sequence_data']
data = file_manager.get_results(selected_data, ['settings'])
data_to_send['settings'] = data['settings']
component_arguments = SequenceView(title='Sequence View')
Expand Down Expand Up @@ -165,8 +188,8 @@ def initialize_data(comp_name, selected_data, file_manager, tool):
data_to_send['protein_table'] = data['protein_dfs']
component_arguments = Tabulator('ProteinTable')
elif comp_name == 'tag_table':
data = file_manager.get_results(selected_data, ['tag_dfs'])
data_to_send['tag_table'] = data['tag_dfs']
data_to_send['tag_table'] = _load_scan_scoped(
file_manager, selected_data, 'tag_dfs', tool, additional_data)
component_arguments = Tabulator('TagTable')
elif comp_name == 'quant_visualization':
data = file_manager.get_results(selected_data, ['quant_dfs'])
Expand Down
27 changes: 27 additions & 0 deletions src/render/scan_resolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pandas as pd


def build_proteoform_scan_map(protein_df, scan_table_df):
    """Map each proteoform index to its scan and the deconv row index.

    protein_df: DataFrame with 'index' (proteoform index) and 'Scan'.
    scan_table_df: DataFrame with 'index' (deconv row index) and 'Scan'.

    Returns {proteoform_index: {'scan': int, 'deconv_index': int}}.
    Proteoforms whose Scan is NaN or absent from scan_table are omitted.
    When a scan number occurs more than once in scan_table_df, the first
    row wins.
    """
    # Build a plain dict once instead of doing a pandas Index membership
    # test plus a .loc lookup for every proteoform row.
    first_rows = scan_table_df.drop_duplicates(subset="Scan", keep="first")
    scan_to_index = dict(zip(first_rows["Scan"], first_rows["index"]))

    result = {}
    for proteoform_index, scan in zip(protein_df["index"], protein_df["Scan"]):
        if pd.isna(scan):
            continue
        # Scan may arrive as a float (column promoted because of NaN).
        scan = int(scan)
        if scan not in scan_to_index:
            continue
        result[int(proteoform_index)] = {
            "scan": scan,
            "deconv_index": int(scan_to_index[scan]),
        }
    return result
90 changes: 90 additions & 0 deletions src/render/sequence_data_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Parquet layout for sequence data: one row per proteoform. Types are spelled
# out explicitly so that columns which are always or sometimes empty
# (fixed_modifications, modifications) still get a consistent type.
_STRING_LIST = pa.list_(pa.string())
_FLOAT_LIST = pa.list_(pa.float64())
_FLOAT_LIST_OF_LISTS = pa.list_(pa.list_(pa.float64()))
_MODIFICATION = pa.struct([
    pa.field("start", pa.int64()),
    pa.field("end", pa.int64()),
    pa.field("mass_diff", pa.float64()),
    pa.field("labels", pa.string()),
])

SCHEMA = pa.schema([
    pa.field("proteoform_index", pa.int64()),
    pa.field("sequence", _STRING_LIST),
    pa.field("theoretical_mass", pa.float64()),
    pa.field("fixed_modifications", _STRING_LIST),
    pa.field("coverage", _FLOAT_LIST),
    pa.field("maxCoverage", pa.float64()),
    pa.field("fragment_masses_a", _FLOAT_LIST_OF_LISTS),
    pa.field("fragment_masses_b", _FLOAT_LIST_OF_LISTS),
    pa.field("fragment_masses_c", _FLOAT_LIST_OF_LISTS),
    pa.field("fragment_masses_x", _FLOAT_LIST_OF_LISTS),
    pa.field("fragment_masses_y", _FLOAT_LIST_OF_LISTS),
    pa.field("fragment_masses_z", _FLOAT_LIST_OF_LISTS),
    pa.field("proteoform_start", pa.int64()),
    pa.field("proteoform_end", pa.int64()),
    pa.field("computed_mass", pa.float64()),
    pa.field("modifications", pa.list_(_MODIFICATION)),
])

# Row-group size used when the table is written to parquet.
ROW_GROUP_SIZE = 64
# Every schema column except the key column, in schema order.
ENTRY_KEYS = [name for name in SCHEMA.names if name != "proteoform_index"]


def _py(x):
"""Recursively convert numpy scalars to builtins so pa.Table.from_pylist
serializes cleanly (coverage/maxCoverage are np.float64)."""
import numpy as np
if isinstance(x, np.generic):
return x.item()
if isinstance(x, list):
return [_py(v) for v in x]
if isinstance(x, dict):
return {k: _py(v) for k, v in x.items()}
return x


def build_table(sequence_data):
    """Turn ``{proteoform_index: entry}`` into a pyarrow Table.

    One row is produced per proteoform, ordered by ascending
    ``proteoform_index``; the stated intent is that row groups then hold
    contiguous index ranges so a filtered read can skip them. Each row
    carries the key plus every ``ENTRY_KEYS`` value of the entry, passed
    through ``_py``; a missing key raises ``KeyError``.
    """
    rows = [
        {
            "proteoform_index": int(pid),
            **{key: _py(sequence_data[pid][key]) for key in ENTRY_KEYS},
        }
        for pid in sorted(sequence_data)
    ]
    return pa.Table.from_pylist(rows, schema=SCHEMA)


def _as_dataset(dataset_or_path):
    """Return a pyarrow Dataset for the argument.

    An existing ``ds.Dataset`` is handed back untouched; anything else is
    stringified and opened as a parquet dataset.
    """
    already_open = isinstance(dataset_or_path, ds.Dataset)
    return (
        dataset_or_path
        if already_open
        else ds.dataset(str(dataset_or_path), format="parquet")
    )


def load_entry(dataset_or_path, proteoform_index):
    """Read a single proteoform's row using a filtered dataset scan.

    Returns the first matching row as a dict of native Python containers
    (via ``to_pylist``) with the ``proteoform_index`` key stripped, or
    ``None`` when no row matches.
    """
    wanted = ds.field("proteoform_index") == int(proteoform_index)
    matches = _as_dataset(dataset_or_path).to_table(filter=wanted).to_pylist()
    if matches:
        found = matches[0]
        found.pop("proteoform_index", None)
        return found
    return None


def reconstruct_all(dataset_or_path):
    """Load the whole store back into ``{proteoform_index: entry}``.

    Reads every row, so per the original author it is meant for migration
    verification and the golden adapter only, not the hot render path.
    A ``ds.Dataset`` is materialized with ``to_table()``; any other
    argument is stringified and read with ``pq.read_table``.
    """
    source = dataset_or_path
    table = (
        source.to_table()
        if isinstance(source, ds.Dataset)
        else pq.read_table(str(source))
    )
    entries = {}
    for record in table.to_pylist():
        # Pop the key column so the stored entry holds only the payload.
        entries[record.pop("proteoform_index")] = record
    return entries
36 changes: 28 additions & 8 deletions src/render/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from src.workflow.FileManager import FileManager
from src.render.sequence import getFragmentDataFromSeq, getInternalFragmentDataFromSeq
from pathlib import Path
from src.render.sequence_data_store import load_entry


def get_sequence(selection_store):
Expand Down Expand Up @@ -117,7 +118,15 @@ def filter_data(data, out_components, selection_store, additional_data, tool):
'Augmented Deconvolved Spectrum',
'Mass Table', 'Sequence View', 'Internal Fragment Map'
]:
if 'scanIndex' not in selection_store:
if tool == 'flashtnt':
scan_map = additional_data.get('proteoform_scan_map', {})
entry = scan_map.get(selection_store.get('proteinIndex'))
if entry is None:
data['per_scan_data'] = data['per_scan_data'].iloc[0:0, :]
else:
per_scan = data['per_scan_data']
data['per_scan_data'] = per_scan[per_scan['index'] == entry['deconv_index']]
elif 'scanIndex' not in selection_store:
data['per_scan_data'] = data['per_scan_data'].iloc[0:0,:]
else:
data['per_scan_data'] = data['per_scan_data'].iloc[selection_store['scanIndex']:selection_store['scanIndex']+1,:]
Expand Down Expand Up @@ -159,23 +168,34 @@ def filter_data(data, out_components, selection_store, additional_data, tool):
else:
selected_data = selection_store[selection]
data['raw_heatmap_df'] = render_heatmap(
additional_data['raw_heatmap_df'],
additional_data['raw_heatmap_df'],
selected_data,
additional_data['dataset'], component
)
elif component == 'Tag Table':
# flashtnt-only panel: tags are scan (spectrum) data. Scope to the
# selected proteoform's scan and stamp ProteinIndex so the frontend's
# tag.ProteinIndex===selectedProteinIndex filter passes all the scan's
# tags through to the table and the on-spectrum overlay.
scan_map = additional_data.get('proteoform_scan_map', {})
entry = scan_map.get(selection_store.get('proteinIndex'))
if entry is None:
data['tag_table'] = data['tag_table'].iloc[0:0, :]
else:
sel = data['tag_table'][data['tag_table']['Scan'] == entry['scan']].copy()
sel['ProteinIndex'] = selection_store['proteinIndex']
data['tag_table'] = sel

if (
(component in ['Internal Fragment Map', 'Sequence View'])
(component in ['Internal Fragment Map', 'Sequence View'])
and (tool == 'flashtnt')
):
if 'proteinIndex' not in selection_store:
data['sequence_data'] = {}
else:
data['sequence_data'] = {
selection_store['proteinIndex'] : data[
'sequence_data'
][selection_store['proteinIndex']]
}
pid = selection_store['proteinIndex']
entry = load_entry(additional_data['sequence_data_ds'], pid)
data['sequence_data'] = {pid: entry} if entry is not None else {}

if (component == 'Internal Fragment Map') and (tool == 'flashtnt'):
if 'proteinIndex' not in selection_store:
Expand Down
Loading