Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,10 @@
defaults_.setMinFloat("add_mass_offset_peptides", 0.0);

// available scores: initialPeakQuality,total_xic,peak_apices_sum,var_xcorr_coelution,var_xcorr_coelution_weighted,var_xcorr_shape,var_xcorr_shape_weighted,var_library_corr,var_library_rmsd,var_library_sangle,var_library_rootmeansquare,var_library_manhattan,var_library_dotprod,var_intensity_score,nr_peaks,sn_ratio,var_log_sn_score,var_elution_model_fit_score,xx_lda_prelim_score,var_isotope_correlation_score,var_isotope_overlap_score,var_massdev_score,var_massdev_score_weighted,var_bseries_score,var_yseries_score,var_dotprod_score,var_manhatt_score,main_var_xx_swath_prelim_score,xx_swath_prelim_score
// exclude some redundant/uninformative scores:
// @TODO: intensity bias introduced by "peak_apices_sum"?
// TODO: evaluate and exclude some redundant/uninformative scores:
// - intensity bias introduced by "peak_apices_sum"?
// - xx_lda_prelim_score is already a lin. comb. of other scores
// - main_var_xx_swath_prelim_score is potentially the same as xx_lda_prelim_score
// names of scores to use as SVM features
String score_metavalues = "peak_apices_sum,var_xcorr_coelution,var_xcorr_shape,var_library_sangle,var_intensity_score,sn_ratio,var_log_sn_score,var_elution_model_fit_score,xx_lda_prelim_score,var_ms1_isotope_correlation_score,var_ms1_isotope_overlap_score,var_massdev_score,main_var_xx_swath_prelim_score";

Expand Down Expand Up @@ -363,319 +365,349 @@
return seeds_added;
}

/**
  @brief Run ID-guided feature detection on the spectra previously loaded into 'ms_data_'.

  @param peptides      Internal peptide IDs (from this file); taken by value because they are
                       annotated, extended with seeds/offset peptides and partially moved out.
  @param proteins      Protein IDs corresponding to the internal peptides (copied into 'features').
  @param peptides_ext  External peptide IDs (from other files); empty disables SVM-related scoring.
  @param proteins_ext  Protein IDs corresponding to the external peptides.
  @param features      Output feature map (filled by this function).
  @param seeds         Optional seed features (mutually exclusive with external IDs).
  @param spectra_file  Path of the raw data file, stored as primary MS run path.

  @throws Exception::InvalidParameter if the SVM sample size is too small for cross-validation.
  @throws Exception::IllegalArgument  if both seeds and external IDs are given.
*/
void FeatureFinderIdentificationAlgorithm::run(
vector<PeptideIdentification> peptides,
const vector<ProteinIdentification>& proteins,
vector<PeptideIdentification> peptides_ext,
vector<ProteinIdentification> proteins_ext,
FeatureMap& features,
const FeatureMap& seeds,
const String& spectra_file
)
{
// fail early: the SVM cross-validation needs at least two samples per fold
if ((svm_n_samples_ > 0) && (svm_n_samples_ < 2 * svm_n_parts_))
{
String msg = "Sample size of " + String(svm_n_samples_) +
" (parameter 'svm:samples') is not enough for " + String(svm_n_parts_) +
"-fold cross-validation (parameter 'svm:xval').";
throw Exception::InvalidParameter(__FILE__, __LINE__,
OPENMS_PRETTY_FUNCTION, msg);
}

// annotate mzML file
features.setPrimaryMSRunPath({spectra_file}, ms_data_);

// initialize algorithm classes needed later:
Param params = feat_finder_.getParameters();
params.setValue("stop_report_after_feature", -1); // return all features
params.setValue("EMGScoring:max_iteration", param_.getValue("EMGScoring:max_iteration"));
params.setValue("EMGScoring:init_mom", param_.getValue("EMGScoring:init_mom"));
params.setValue("Scores:use_rt_score", "false"); // RT may not be reliable
params.setValue("Scores:use_ionseries_scores", "false"); // since FFID only uses MS1 spectra, this is useless
params.setValue("Scores:use_ms2_isotope_scores", "false"); // since FFID only uses MS1 spectra, this is useless
params.setValue("Scores:use_ms1_correlation", "false"); // this would be redundant to the "MS2" correlation and since
// precursor transition = first product transition, additionally biased
params.setValue("Scores:use_ms1_mi", "false"); // same as above. On MS1 level we basically only care about the "MS1 fullscan" scores
//TODO for MS1 level scoring there is an additional parameter add_up_spectra with which we can add up spectra
// around the apex, to complete isotopic envelopes (and therefore make this score more robust).

if (peptides_ext.empty()) // SVM scores not needed, disable the most expensive ones
{
// TODO this score also affects Swath LDA prescoring. I wonder if/how this impacts the
// creation/addition of a feature.
params.setValue("Scores:use_elution_model_score", "false");
}

if ((elution_model_ != "none") || (!candidates_out_.empty()))
{
params.setValue("write_convex_hull", "true");
}
if (min_peak_width_ < 1.0)
{
// values below 1 are interpreted as a fraction of the expected peak width
min_peak_width_ *= peak_width_;
}

// TODO I wonder if the following parameters would be enough.
// In theory we only care for one feature per one set of extracted chromatograms (transition group)
//params.setValue("stop_report_after_feature", 1); // best by quality, after scoring
//params.setValue("TransitionGroupPicker:stop_after_feature", 1); // best by intensity, after picking, before scoring

params.setValue("TransitionGroupPicker:PeakPickerChromatogram:gauss_width",
peak_width_);
params.setValue("TransitionGroupPicker:min_peak_width", min_peak_width_);
// disabling the signal-to-noise threshold (setting the parameter to zero)
// totally breaks the OpenSWATH feature detection (no features found)!
params.setValue("TransitionGroupPicker:PeakPickerChromatogram:signal_to_noise",
signal_to_noise_);
params.setValue("TransitionGroupPicker:recalculate_peaks", "true");
params.setValue("TransitionGroupPicker:PeakPickerChromatogram:peak_width", -1.0);
// (previously this parameter was redundantly set twice in a row)
params.setValue("TransitionGroupPicker:PeakPickerChromatogram:method", "corrected"); // default

// ======================================================================================================================================
// TODO: The following group of parameters probably only have an effect if ElutionModelFitter at the end of FFID
// (in post_process_) is deactivated with 'elution_model = "none"'.
// Otherwise ElutionModelFitter fits a new model (EGH instead of EMG) that takes into account all traces per feature
// at the same time and overwrites intensities. "Old" SWATH intensities might be available in the feature as metavalue
// "raw_intensity" though.

// Note: this EMG fitting only affects integration, not scoring
params.setValue("TransitionGroupPicker:PeakIntegrator:fit_EMG", "false"); // default (= disabled)
// Note: Fitting for scoring is controlled via EMGScoring parameter group. Fitting for integration here uses a completely
// different algorithm (EmgGradientDescent class; params not exposed) with gradient descent instead of Eigen::LevenbergMarquardt algorithm (EmgFitter1D class).
// TODO: evaluate difference in runtimes and quality of result
params.setValue("TransitionGroupPicker:PeakIntegrator:integration_type", "intensity_sum"); // default
params.setValue("TransitionGroupPicker:background_subtraction", "exact"); // enable (default was "none")
params.setValue("TransitionGroupPicker:PeakIntegrator:baseline_type", "base_to_base"); // default
// ======================================================================================================================================

params.setValue("TransitionGroupPicker:PeakPickerChromatogram:write_sn_log_messages", "false"); // disabled in OpenSWATH

feat_finder_.setParameters(params);
feat_finder_.setLogType(ProgressLogger::NONE);
feat_finder_.setStrictFlag(false);
// to use MS1 Swath scores:
feat_finder_.setMS1Map(SimpleOpenMSSpectraFactory::getSpectrumAccessOpenMSPtr(boost::make_shared<MSExperiment>(ms_data_)));

double rt_uncertainty(0);
bool with_external_ids = !peptides_ext.empty();

if (with_external_ids && !seeds.empty())
{
throw Exception::IllegalArgument(
__FILE__,
__LINE__,
OPENMS_PRETTY_FUNCTION,
"Using seeds and external ids is currently not supported.");
}

if (with_external_ids)
{
// align internal and external IDs to estimate RT shifts:
MapAlignmentAlgorithmIdentification aligner;
aligner.setReference(peptides_ext); // go from internal to external scale
vector<vector<PeptideIdentification> > aligner_peptides(1, peptides);
vector<TransformationDescription> aligner_trafos;

OPENMS_LOG_INFO << "Realigning internal and external IDs...";
aligner.align(aligner_peptides, aligner_trafos);
trafo_external_ = aligner_trafos[0];
vector<double> aligned_diffs;
trafo_external_.getDeviations(aligned_diffs);
// Pick the RT deviation at the requested quantile. The rank must be
// computed in signed-safe fashion: the previous code did
// "max(Size(0), Size(...) - 1)", but 'Size' is unsigned, so a rank of 0
// wrapped around to a huge index before 'max' could clamp it, causing a
// potential out-of-bounds read (also for an empty deviations vector).
if (!aligned_diffs.empty())
{
Size rank = Size(rt_quantile_ * static_cast<double>(aligned_diffs.size()));
Size index = (rank > 0) ? rank - 1 : 0;
rt_uncertainty = aligned_diffs[index];
}
try
{
aligner_trafos[0].fitModel("lowess");
trafo_external_ = aligner_trafos[0];
}
catch (Exception::BaseException& e)
{
OPENMS_LOG_ERROR << "Error: Failed to align RTs of internal/external peptides. RT information will not be considered in the SVM classification. The original error message was:\n" << e.what() << endl;
}
}

if (rt_window_ == 0.0)
{
// calculate RT window based on other parameters and alignment quality:
double map_tol = mapping_tolerance_;
if (map_tol < 1.0)
{
map_tol *= (2 * peak_width_); // relative tolerance
}
rt_window_ = (rt_uncertainty + 2 * peak_width_ + map_tol) * 2;
OPENMS_LOG_INFO << "RT window size calculated as " << rt_window_ << " seconds."
<< endl;
}

//-------------------------------------------------------------
// prepare peptide map
//-------------------------------------------------------------
OPENMS_LOG_INFO << "Preparing mapping of peptide data..." << endl;
peptide_map_.clear();

// Reserve enough space for all possible seeds (pointers into 'peptides' are
// stored in 'peptide_map_', so the vector must not reallocate later)
{
Size max_size = peptides.size() + seeds.size();
if (add_mass_offset_peptides_ > 0.0) // consistent with the check below
{
max_size *= 2;
}
peptides.reserve(max_size);
}

for (auto& pep : peptides)
{
addPeptideToMap_(pep, peptide_map_);
pep.setMetaValue("FFId_category", "internal");
}

// TODO make sure that only assembled traces (more than one trace -> has a charge) if FFMetabo is used
// see FeatureFindingMetabo: defaults_.setValue("remove_single_traces", "false", "Remove unassembled traces (single traces).");
Size seeds_added = addSeeds_(peptides, seeds);
OPENMS_LOG_INFO << "#Seeds without RT and m/z overlap with identified peptides added: " << seeds_added << endl;

if (add_mass_offset_peptides_ > 0.0)
{
Size n_added = addOffsetPeptides_(peptides, add_mass_offset_peptides_);
OPENMS_LOG_INFO << "#Offset peptides without RT and m/z overlap with other peptides added: " << n_added << endl;
}

n_internal_peps_ = peptide_map_.size();
for (PeptideIdentification& pep : peptides_ext)
{
addPeptideToMap_(pep, peptide_map_, true);
pep.setMetaValue("FFId_category", "external");
}
n_external_peps_ = peptide_map_.size() - n_internal_peps_;

boost::shared_ptr<PeakMap> shared = boost::make_shared<PeakMap>(ms_data_);
OpenSwath::SpectrumAccessPtr spec_temp =
SimpleOpenMSSpectraFactory::getSpectrumAccessOpenMSPtr(shared);
auto chunks = chunk_(peptide_map_.begin(), peptide_map_.end(), batch_size_);

PeptideRefRTMap ref_rt_map;
if (debug_level_ >= 668)
{
OPENMS_LOG_INFO << "Creating full assay library for debugging." << endl;
// Warning: this step is pretty inefficient, since it does the whole library generation twice
// Really use for debug only
createAssayLibrary_(peptide_map_.begin(), peptide_map_.end(), ref_rt_map, false);
cout << "Writing debug.traml file." << endl;
FileHandler().storeTransitions("debug.traml", library_);
ref_rt_map.clear();
library_.clear(true);
}

//-------------------------------------------------------------
// run feature detection
//-------------------------------------------------------------
//Note: progress only works in non-debug when no logs come in-between
getProgressLogger().startProgress(0, chunks.size(), "Creating assay library and extracting chromatograms");
Size chunk_count = 0;
for (auto& chunk : chunks)
{
//TODO since ref_rt_map is only used after chunking, we could create
// maps per chunk and merge them in the end. Would help in parallelizing as well.
createAssayLibrary_(chunk.first, chunk.second, ref_rt_map);
OPENMS_LOG_DEBUG << "#Transitions: " << library_.getTransitions().size() << endl;

ChromatogramExtractor extractor;
// extractor.setLogType(ProgressLogger::NONE);
{
vector<OpenSwath::ChromatogramPtr> chrom_temp;
vector<ChromatogramExtractor::ExtractionCoordinates> coords;
// take entries in library_ and put to chrom_temp and coords
extractor.prepare_coordinates(chrom_temp, coords, library_,
numeric_limits<double>::quiet_NaN(), false);


extractor.extractChromatograms(spec_temp, chrom_temp, coords, mz_window_,
mz_window_ppm_, "tophat");
extractor.return_chromatogram(chrom_temp, coords, library_, (*shared)[0],
chrom_data_.getChromatograms(), false);
}

OPENMS_LOG_DEBUG << "Extracted " << chrom_data_.getNrChromatograms()
<< " chromatogram(s)." << endl;

OPENMS_LOG_DEBUG << "Detecting chromatographic peaks..." << endl;
// suppress status output from OpenSWATH, unless in debug mode:
if (debug_level_ < 1)
{
OpenMS_Log_info.remove(cout);
}
feat_finder_.pickExperiment(chrom_data_, features, library_,
TransformationDescription(), ms_data_);
if (debug_level_ < 1)
{
OpenMS_Log_info.insert(cout); // revert logging change
}
chrom_data_.clear(true);
library_.clear(true);
// since chrom_data_ here is just a container for the chromatograms and identifications will be empty,
// pickExperiment above will only add empty ProteinIdentification runs with colliding identifiers.
// Usually we could sanitize the identifiers or merge the runs, but since they are empty and we add the
// "real" proteins later -> just clear them
features.getProteinIdentifications().clear();
getProgressLogger().setProgress(++chunk_count);
}
getProgressLogger().endProgress();

OPENMS_LOG_INFO << "Found " << features.size() << " feature candidates in total."
<< endl;

ms_data_.reset(); // not needed anymore, free up the memory
// complete feature annotation:
annotateFeatures_(features, ref_rt_map);

// sort everything:
sort(features.getUnassignedPeptideIdentifications().begin(),
features.getUnassignedPeptideIdentifications().end(),
peptide_compare_);
sort(features.begin(), features.end(), feature_compare_);

postProcess_(features, with_external_ids);
statistics_(features);

features.setProteinIdentifications(proteins);
// add external IDs (if any):
features.getProteinIdentifications().insert(
features.getProteinIdentifications().end(), proteins_ext.begin(),
proteins_ext.end());
features.getUnassignedPeptideIdentifications().insert(
features.getUnassignedPeptideIdentifications().end(),
peptides_ext.begin(), peptides_ext.end());

// remove all hits with pseudo ids (seeds)
for (Feature& f : features)
{
std::vector<PeptideIdentification>& ids = f.getPeptideIdentifications();

// if we have peptide identifications assigned and all are annotated as OffsetPeptide, we mark the feature is also an OffsetPeptide
if (!ids.empty() && std::all_of(ids.begin(), ids.end(), [](const PeptideIdentification & pid) { return pid.metaValueExists("OffsetPeptide"); }))
{
f.setMetaValue("OffsetPeptide", "true");
}

// remove all hits (PSM details) whose sequence carries the seed marker prefix
for (auto & pid : ids)
{
std::vector<PeptideHit>& hits = pid.getHits();
auto it = remove_if(hits.begin(), hits.end(),
[](const PeptideHit & ph)
{
return (ph.getSequence().toUnmodifiedString().hasPrefix("XXX"));
});
hits.erase(it, hits.end()); // remove / erase idiom
}

// remove empty PeptideIdentifications
auto it = remove_if(ids.begin(), ids.end(),
[](const PeptideIdentification & pid)
{
return pid.empty();
});
ids.erase(it, ids.end()); // remove / erase idiom
}

// clean up unassigned PeptideIdentifications
std::vector<PeptideIdentification>& ids = features.getUnassignedPeptideIdentifications();
for (auto & pid : ids)
{
std::vector<PeptideHit>& hits = pid.getHits();
auto it = remove_if(hits.begin(), hits.end(), [](const PeptideHit & ph)
{
return (ph.getSequence().toUnmodifiedString().hasPrefix("XXX"));
});
hits.erase(it, hits.end());
}

// remove empty PeptideIdentifications
auto it = remove_if(ids.begin(), ids.end(),
[](const PeptideIdentification & pid)
{
return pid.empty();
});
ids.erase(it, ids.end()); // remove / erase idiom

// add back ignored PSMs
features.getUnassignedPeptideIdentifications().insert(features.getUnassignedPeptideIdentifications().end(),
std::move_iterator(unassignedIDs_.begin()),
std::move_iterator(unassignedIDs_.end()));

features.ensureUniqueId();
}

Check notice on line 710 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L368-L710

Complex Method
void FeatureFinderIdentificationAlgorithm::postProcess_(
FeatureMap & features,
bool with_external_ids)
Expand Down Expand Up @@ -727,7 +759,7 @@
}
}
}

Check notice on line 762 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L762

Redundant blank line at the end of a code block should be deleted. (whitespace/blank_line)
}

void FeatureFinderIdentificationAlgorithm::runOnCandidates(FeatureMap & features)
Expand Down Expand Up @@ -781,7 +813,7 @@
postProcess_(features, with_external_ids);

statistics_(features);

Check notice on line 816 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L816

Redundant blank line at the end of a code block should be deleted. (whitespace/blank_line)
}

void FeatureFinderIdentificationAlgorithm::statistics_(FeatureMap const & features) const
Expand Down Expand Up @@ -821,184 +853,184 @@
<< " peptides without features ("
<< n_internal_peps_ - quantified_internal.size() << " internal, "
<< n_missing_external << " external)\n" << endl;

Check notice on line 856 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L856

Redundant blank line at the end of a code block should be deleted. (whitespace/blank_line)
}

// Build the OpenSWATH assay library ('library_') for the peptide_map_ entries
// in the given iterator range, and record per generated assay ID the
// internal/external peptide identifications in 'ref_rt_map'.
// Seeds (fake sequences prefixed with "XXX") get one assay per seed hit;
// real peptides get one assay per charge state and RT region.
// 'clear_IDs' is forwarded to getRTRegions_ (presumably allows freeing the
// per-region ID maps after use - TODO confirm against getRTRegions_).
void FeatureFinderIdentificationAlgorithm::createAssayLibrary_(const PeptideMap::iterator& begin, const PeptideMap::iterator& end, PeptideRefRTMap& ref_rt_map, bool clear_IDs)
{
// accessions of all proteins referenced by any assay; added to the library at the end
std::set<String> protein_accessions;

Size seedcount = 0;
for (auto pm_it = begin;
pm_it != end; ++pm_it)
{
TargetedExperiment::Peptide peptide;
const AASequence &seq = pm_it->first;


// @NOTE: Technically, "TargetedExperiment::Peptide" stores the unmodified
// sequence and the modifications separately. Unfortunately, creating the
// modifications vector is complex and there is currently no convenient
// conversion function (see "TargetedExperimentHelper::getAASequence" for
// the reverse conversion). However, "Peptide" is later converted to
// "OpenSwath::LightPeptide" anyway, and this is done via "AASequence"
// (see "OpenSwathDataAccessHelper::convertTargetedPeptide"). So for our
// purposes it works to just store the sequence including modifications in
// "Peptide".

// for now, seeds are stored in the same PeptideRefMap, all
// under the same fake sequence key entry
// TODO add own data structure for them
if (seq.toUnmodifiedString().hasPrefix("XXX")) // seed
{
// This will force the SWATH scores to consider it like an unidentified peptide and e.g. use averagine isotopes
peptide.sequence = "";
// we do not have to aggregate their retention times, therefore just
// iterate over the entries
const ChargeMap& cm = pm_it->second;
for (const auto& charge_rtmap : cm)
{
Int charge = charge_rtmap.first;
// only go through internals for seeds (->first). External seeds are not supported
for (const auto& rt_pep : charge_rtmap.second.first)
{
// since we don't know their IDs, seeds will all need a different grouplabel in SWATH
// to not be combined
seedcount++;

double mz = rt_pep.second->getMZ();
double rt = rt_pep.second->getRT();
String uid = rt_pep.second->getMetaValue("SeedFeatureID");

// UID should be enough, but let's add the seed count to be sure.
String peptide_id = peptide.sequence + "[" + uid + "][" + String(seedcount) + "]/" + String(charge);
peptide.setChargeState(charge);
peptide.id = peptide_id;
peptide.protein_refs = {"not_available"};
peptide.setPeptideGroupLabel(peptide_id);

//create an entry in the "output" ref_rt_map for internals
RTMap &internal_ids = ref_rt_map[peptide_id].first;

// get isotope distribution for peptide:
//TODO Why 10? Document constant?
// with a probability cutoff, start from 10 isotopes and trim below;
// otherwise use the configured fixed number of isotopes
Size n_isotopes = (isotope_pmin_ > 0.0) ? 10 : n_isotopes_;
CoarseIsotopePatternGenerator generator(n_isotopes);
// seeds have no known sequence, so estimate an averagine-like
// distribution from the (neutral) peptide weight derived from m/z
IsotopeDistribution iso_dist = generator
.estimateFromPeptideWeight(mz * charge - charge * Constants::PROTON_MASS_U);
if (isotope_pmin_ > 0.0)
{
iso_dist.trimLeft(isotope_pmin_);
iso_dist.trimRight(isotope_pmin_);
iso_dist.renormalize();
}

// extraction window is centered on the seed RT
double rt_tolerance = seed_rt_window_ / 2.0;

// store beginning and end of RT region: here we only need one entry
peptide.rts.clear();
addPeptideRT_(peptide, rt - rt_tolerance);
addPeptideRT_(peptide, rt + rt_tolerance);
library_.addPeptide(peptide);
generateTransitions_(peptide.id, mz, charge, iso_dist);
internal_ids.emplace(rt_pep);
}
}
}
else
{
peptide.sequence = seq.toString();
// keep track of protein accessions:
set<String> current_accessions;
// internal/external pair
const pair<RTMap, RTMap> &pair = pm_it->second.begin()->second;

// WARNING: This assumes that at least one hit is present.
// (Prefer an internal ID; fall back to the first external one.)
const PeptideHit &hit = (pair.first.empty() ?
pair.second.begin()->second->getHits()[0] :
pair.first.begin()->second->getHits()[0]);
current_accessions = hit.extractProteinAccessionsSet();
protein_accessions.insert(current_accessions.begin(),
current_accessions.end());
// missing protein accession would crash OpenSWATH algorithms:
if (current_accessions.empty())
{
current_accessions.insert("not_available");
}

peptide.protein_refs = vector<String>(current_accessions.begin(),
current_accessions.end());
// get regions in which peptide eludes (ideally only one):
std::vector<RTRegion> rt_regions;
getRTRegions_(pm_it->second, rt_regions, clear_IDs);

// get isotope distribution for peptide:
Size n_isotopes = (isotope_pmin_ > 0.0) ? 10 : n_isotopes_;
IsotopeDistribution iso_dist =
seq.getFormula(Residue::Full, 0).getIsotopeDistribution(CoarseIsotopePatternGenerator(n_isotopes));
if (isotope_pmin_ > 0.0)
{
iso_dist.trimLeft(isotope_pmin_);
iso_dist.trimRight(isotope_pmin_);
iso_dist.renormalize();
}

// go through different charge states:
for (ChargeMap::const_iterator cm_it = pm_it->second.begin();
cm_it != pm_it->second.end(); ++cm_it)
{
Int charge = cm_it->first;

double mz = seq.getMZ(charge);
OPENMS_LOG_DEBUG << "\nPeptide " << peptide.sequence << "/" << charge << " (m/z: " << mz << "):" << endl;
peptide.setChargeState(charge);
String peptide_id = peptide.sequence + "/" + String(charge);

// we want to detect one feature per peptide and charge state - if there
// are multiple RT regions, group them together:
peptide.setPeptideGroupLabel(peptide_id);
peptide.rts.clear();
Size counter = 0;
// accumulate IDs over multiple regions:
RTMap &internal_ids = ref_rt_map[peptide_id].first;
RTMap &external_ids = ref_rt_map[peptide_id].second;
for (RTRegion& reg : rt_regions)
{
if (reg.ids.count(charge))
{
OPENMS_LOG_DEBUG_NOFILE << "Charge " << charge << ", Region# " << counter + 1 << " (RT: "
<< float(reg.start) << "-" << float(reg.end)
<< ", size " << float(reg.end - reg.start) << ")"
<< endl;

// with multiple regions, assay IDs get a ":<n>" suffix to stay unique
peptide.id = peptide_id;
if (rt_regions.size() > 1)
peptide.id += ":" + String(++counter);

// store beginning and end of RT region:
peptide.rts.clear();
addPeptideRT_(peptide, reg.start);
addPeptideRT_(peptide, reg.end);
library_.addPeptide(peptide);
generateTransitions_(peptide.id, mz, charge, iso_dist);
}
// note: IDs are accumulated for ALL regions (even without this charge),
// keyed by the un-suffixed peptide_id
internal_ids.insert(reg.ids[charge].first.begin(),
reg.ids[charge].first.end());
external_ids.insert(reg.ids[charge].second.begin(),
reg.ids[charge].second.end());
}
}
}
}
// add proteins to library:
for (String const &acc : protein_accessions)
{
TargetedExperiment::Protein protein;
protein.id = acc;
library_.addProtein(protein);
}
}

Check notice on line 1033 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L859-L1033

Complex Method
void FeatureFinderIdentificationAlgorithm::getRTRegions_(
ChargeMap& peptide_data,
std::vector<RTRegion>& rt_regions,
Expand Down Expand Up @@ -1179,188 +1211,188 @@
}

/// Annotate feature candidates with m/z, charge, convex hulls and isotope
/// probabilities; map peptide IDs to features by RT overlap and classify each
/// feature ("ambiguous"/"negative"/"unknown"); finally collect peptide IDs
/// whose assays produced no feature candidates as unassigned IDs.
/// Assumes 'features' is grouped by assay ("PeptideRef" metavalue), so
/// per-assay state can be finalized whenever the ref changes.
void FeatureFinderIdentificationAlgorithm::annotateFeatures_(FeatureMap& features, PeptideRefRTMap& ref_rt_map)
{
String previous_ref, peptide_ref;
RTMap transformed_internal; // internal IDs mapped to the external RT scale (lazy cache)
Size i = 0; // index of 'feat' within 'features' (kept in sync with the loop)
map<Size, vector<PeptideIdentification*> > feat_ids; // feature index -> matching internal IDs
for (Feature& feat : features)
{
feat.setMZ(feat.getMetaValue("PrecursorMZ"));
feat.setCharge(feat.getPeptideIdentifications()[0].getHits()[0].
getCharge());
ensureConvexHulls_(feat);
// remove "fake" IDs generated by OpenSWATH (they would be removed with
// a warning when writing output, because of missing protein
// identification with corresponding identifier):
feat.getPeptideIdentifications().clear();
// annotate subordinates with theoretical isotope intensities:
for (Feature& sub : feat.getSubordinates())
{
String native_id = sub.getMetaValue("native_id");
sub.setMetaValue("isotope_probability", isotope_probs_[native_id]);
}

peptide_ref = feat.getMetaValue("PeptideRef");
// remove region number, if present ("<seq>/<charge>:<region>" -> "<seq>/<charge>"):
Size pos_slash = peptide_ref.rfind('/');
Size pos_colon = peptide_ref.find(':', pos_slash + 2);
peptide_ref = peptide_ref.substr(0, pos_colon);

// entering a new assay: finalize the bookkeeping for the previous one
if (peptide_ref != previous_ref)
{
if (!previous_ref.empty())
{
annotateFeaturesFinalizeAssay_(
features, feat_ids, ref_rt_map[previous_ref].first);
}
previous_ref = peptide_ref;
}

RTMap& rt_internal = ref_rt_map[peptide_ref].first;
RTMap& rt_external = ref_rt_map[peptide_ref].second;

if (rt_internal.empty() && rt_external.empty())
{
// every feature must stem from some assay -> dump the map for debugging
OPENMS_LOG_DEBUG << "PeptideRefs in RTMap:" << endl;
for (const auto& rtm : ref_rt_map)
{
OPENMS_LOG_DEBUG << rtm.first << endl;
}

throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "RT internal and external are both empty for peptide '" + String(peptide_ref) + "' stored as '" + String(feat.getMetaValue("PeptideRef")) + "'.");
}

if (!rt_internal.empty()) // validate based on internal IDs
{
// map IDs to features (based on RT):
// note: features[i] is the same element as 'feat' here
double rt_min = features[i].getMetaValue("leftWidth");
double rt_max = features[i].getMetaValue("rightWidth");
if (mapping_tolerance_ > 0.0)
{
double abs_tol = mapping_tolerance_;
if (abs_tol < 1.0)
{
// tolerance below 1 is relative to the feature's RT width
abs_tol *= (rt_max - rt_min);
}
rt_min -= abs_tol;
rt_max += abs_tol;
}
RTMap::const_iterator lower = rt_internal.lower_bound(rt_min);
RTMap::const_iterator upper = rt_internal.upper_bound(rt_max);
int id_count = 0;
for (; lower != upper; ++lower)
{
feat_ids[i].push_back(lower->second);
++id_count;
}
// "total" only includes IDs from this RT region:
feat.setMetaValue("n_total_ids", rt_internal.size());
feat.setMetaValue("n_matching_ids", id_count);
if (id_count > 0) // matching IDs -> feature may be correct
{
feat.setMetaValue("feature_class", "ambiguous");
}
else // no matching IDs -> feature is wrong
{
feat.setMetaValue("feature_class", "negative");
}
}
else // only external IDs -> no validation possible
{
feat.setMetaValue("n_total_ids", 0);
feat.setMetaValue("n_matching_ids", -1);
feat.setMetaValue("feature_class", "unknown");
// add "dummy" peptide identification:
PeptideIdentification id = *(rt_external.begin()->second);
id.clearMetaInfo();
id.setMetaValue("FFId_category", "implied");
id.setRT(feat.getRT());
id.setMZ(feat.getMZ());
// only one peptide hit per ID - see function "addPeptideToMap_":
PeptideHit& hit = id.getHits()[0];
hit.clearMetaInfo();
hit.setScore(0.0);
feat.getPeptideIdentifications().push_back(id);
}

// distance from feature to closest peptide ID:
if (!trafo_external_.getDataPoints().empty())
{
// use external IDs if available, otherwise RT-transformed internal IDs
// (but only compute the transform if necessary, once per assay!):
// NOTE(review): at this point 'previous_ref' has already been set equal
// to 'peptide_ref' above, so the second condition is always false and
// the cache is only rebuilt when empty - a non-empty cache from a
// previous assay looks like it could be reused here; verify intent.
if (rt_external.empty() && (transformed_internal.empty() ||
(peptide_ref != previous_ref)))
{
transformed_internal.clear();
for (RTMap::const_iterator it = rt_internal.begin();
it != rt_internal.end(); ++it)
{
double transformed_rt = trafo_external_.apply(it->first);
RTMap::value_type pair = make_pair(transformed_rt, it->second);
transformed_internal.insert(transformed_internal.end(), pair);
}
}
const RTMap& rt_ref = (rt_external.empty() ? transformed_internal :
rt_external);

double rt_min = feat.getMetaValue("leftWidth");
double rt_max = feat.getMetaValue("rightWidth");
if (mapping_tolerance_ > 0.0)
{
double abs_tol = mapping_tolerance_;
if (abs_tol < 1.0)
{
abs_tol *= (rt_max - rt_min);
}
rt_min -= abs_tol;
rt_max += abs_tol;
}
RTMap::const_iterator lower = rt_ref.lower_bound(rt_min);
RTMap::const_iterator upper = rt_ref.upper_bound(rt_max);
if (lower != upper) // there's at least one ID within the feature
{
feat.setMetaValue("rt_delta", 0.0);
}
else // check closest ID
{
double rt_delta1 = numeric_limits<double>::infinity();
if (lower != rt_ref.begin())
{
rt_delta1 = fabs((--lower)->first - rt_min);
}
double rt_delta2 = numeric_limits<double>::infinity();
if (upper != rt_ref.end())
{
// NOTE(review): distance of the next-higher ID is measured against
// rt_min, not rt_max - looks asymmetric; confirm this is intended
rt_delta2 = fabs(upper->first - rt_min);
}
feat.setMetaValue("rt_delta", min(rt_delta1, rt_delta2));
}
}
++i;
}
// set of features from the last assay:
annotateFeaturesFinalizeAssay_(features, feat_ids,
ref_rt_map[peptide_ref].first);
// store unassigned peptide IDs from assays that did not generate any
// feature candidates:
for (PeptideRefRTMap::iterator ref_it = ref_rt_map.begin();
ref_it != ref_rt_map.end(); ++ref_it)
{
RTMap& rt_internal = ref_it->second.first;
if (!rt_internal.empty()) // not cleared by '...FinalizeAssay()'
{
for (RTMap::const_iterator rt_it = rt_internal.begin();
rt_it != rt_internal.end(); ++rt_it)
{
const PeptideIdentification& pep_id = *(rt_it->second);
features.getUnassignedPeptideIdentifications().push_back(pep_id);
}
}
}
}

Check notice on line 1395 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L1214-L1395

Complex Method
void FeatureFinderIdentificationAlgorithm::ensureConvexHulls_(Feature& feature) const
{
if (feature.getConvexHulls().empty()) // add hulls for mass traces
Expand Down Expand Up @@ -1520,7 +1552,7 @@
double thresholds[2] = {counts[1] / float(counts[0]),
counts[0] / float(counts[1])};
// check middle values:
double rnd = rand() / double(RAND_MAX); // random num. in range 0-1

Check notice on line 1555 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L1555

Consider using rand_r(...) instead of rand(...) for improved thread safety. (runtime/threadsafe_fn)
if (rnd < thresholds[middle->second.second])
{
training_labels[middle->second.first] = Int(middle->second.second);
Expand Down Expand Up @@ -1597,128 +1629,128 @@
training_labels.swap(temp);
}

// Classify feature candidates as correct/incorrect using an SVM trained on the
// features previously labeled "positive"/"negative" (meta value
// "feature_class"). Stores the predicted class and probability as meta values
// ("predicted_class", "predicted_probability") and uses the probability of the
// positive class as the feature's overall quality.
void FeatureFinderIdentificationAlgorithm::classifyFeatures_(FeatureMap& features)
{
  if (features.empty())
  {
    return;
  }
  // "rt_delta" is only set on features in some workflows; if present, use it
  // as an additional SVM predictor (add it at most once):
  if (features[0].metaValueExists("rt_delta"))
  {
    if (std::find(svm_predictor_names_.begin(), svm_predictor_names_.end(),
                  "rt_delta") == svm_predictor_names_.end())
    {
      svm_predictor_names_.push_back("rt_delta");
    }
  }
  // values for all features per predictor (this way around to simplify scaling
  // of predictors):
  SimpleSVM::PredictorMap predictors;
  for (const String& pred : svm_predictor_names_)
  {
    auto& values = predictors[pred]; // look up the map entry only once
    values.reserve(features.size());
    for (const Feature& feat : features)
    {
      if (!feat.metaValueExists(pred))
      {
        OPENMS_LOG_ERROR << "Meta value '" << pred << "' missing for feature '"
                         << feat.getUniqueId() << "'" << endl;
        // a missing value invalidates the whole predictor; 'values' dangles
        // after the erase, but we leave the inner loop immediately:
        predictors.erase(pred);
        break;
      }
      values.push_back(feat.getMetaValue(pred));
    }
  }

  // get labels for SVM:
  std::map<Size, double> training_labels;
  const bool no_selection = param_.getValue("svm:no_selection") == "true";
  // mapping (for bias correction): intensity -> (index, positive?)
  std::multimap<double, pair<Size, bool> > valid_obs;
  Size n_obs[2] = {0, 0}; // counters for neg./pos. observations
  for (Size feat_index = 0; feat_index < features.size(); ++feat_index)
  {
    const String feature_class = features[feat_index].getMetaValue("feature_class");
    int label = -1; // -1: unlabeled, 0: negative, 1: positive
    if (feature_class == "positive")
    {
      label = 1;
    }
    else if (feature_class == "negative")
    {
      label = 0;
    }
    if (label != -1)
    {
      ++n_obs[label];
      if (!no_selection)
      {
        // defer selection to the intensity-bias-corrected sampling below:
        double intensity = features[feat_index].getIntensity();
        valid_obs.insert(make_pair(intensity, make_pair(feat_index,
                                                        label == 1)));
      }
      else
      {
        training_labels[feat_index] = static_cast<double>(label);
      }
    }
  }
  checkNumObservations_(n_obs[1], n_obs[0]);

  if (!no_selection)
  {
    getUnbiasedSample_(valid_obs, training_labels);
  }
  if (svm_n_samples_ > 0) // limited number of samples for training
  {
    if (training_labels.size() < svm_n_samples_)
    {
      OPENMS_LOG_WARN << "Warning: There are only " << training_labels.size()
                      << " valid observations for training." << endl;
    }
    else if (training_labels.size() > svm_n_samples_)
    {
      getRandomSample_(training_labels);
    }
  }

  SimpleSVM svm;
  // set (only) the relevant parameters:
  Param svm_params = svm.getParameters();
  Logger::LogStream no_log; // suppress warnings about additional parameters
  svm_params.update(param_.copy("svm:", true), false, no_log);
  svm.setParameters(svm_params);
  svm.setup(predictors, training_labels);
  if (!svm_xval_out_.empty())
  {
    svm.writeXvalResults(svm_xval_out_);
  }
  // for a linear kernel the per-predictor weights are informative - log them:
  if ((debug_level_ > 0) && svm_params.getValue("kernel") == "linear")
  {
    std::map<String, double> feature_weights;
    svm.getFeatureWeights(feature_weights);
    OPENMS_LOG_DEBUG << "SVM feature weights:" << endl;
    for (const auto& weight : feature_weights)
    {
      OPENMS_LOG_DEBUG << "- " << weight.first << ": " << weight.second << endl;
    }
  }

  std::vector<SimpleSVM::Prediction> predictions;
  svm.predict(predictions);
  OPENMS_POSTCONDITION(predictions.size() == features.size(),
                       "SVM predictions for all features expected");
  for (Size i = 0; i < features.size(); ++i)
  {
    features[i].setMetaValue("predicted_class", predictions[i].outcome);
    double prob_positive = predictions[i].probabilities[1];
    features[i].setMetaValue("predicted_probability", prob_positive);
    // @TODO: store previous (OpenSWATH) overall quality in a meta value?
    features[i].setOverallQuality(prob_positive);
  }
}


Check notice on line 1753 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L1632-L1753

Complex Method
void FeatureFinderIdentificationAlgorithm::filterFeaturesFinalizeAssay_(Feature& best_feature, double best_quality,
const double quality_cutoff)
{
Expand Down
Loading