Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,10 @@
defaults_.setMinFloat("add_mass_offset_peptides", 0.0);

// available scores: initialPeakQuality,total_xic,peak_apices_sum,var_xcorr_coelution,var_xcorr_coelution_weighted,var_xcorr_shape,var_xcorr_shape_weighted,var_library_corr,var_library_rmsd,var_library_sangle,var_library_rootmeansquare,var_library_manhattan,var_library_dotprod,var_intensity_score,nr_peaks,sn_ratio,var_log_sn_score,var_elution_model_fit_score,xx_lda_prelim_score,var_isotope_correlation_score,var_isotope_overlap_score,var_massdev_score,var_massdev_score_weighted,var_bseries_score,var_yseries_score,var_dotprod_score,var_manhatt_score,main_var_xx_swath_prelim_score,xx_swath_prelim_score
// exclude some redundant/uninformative scores:
// @TODO: intensity bias introduced by "peak_apices_sum"?
// TODO: evaluate and exclude some redundant/uninformative scores:
// - intensity bias introduced by "peak_apices_sum"?
// - xx_lda_prelim_score is already a lin. comb. of other scores
// - main_var_xx_swath_prelim_score is potentially the same as xx_lda_prelim_score
// names of scores to use as SVM features
String score_metavalues = "peak_apices_sum,var_xcorr_coelution,var_xcorr_shape,var_library_sangle,var_intensity_score,sn_ratio,var_log_sn_score,var_elution_model_fit_score,xx_lda_prelim_score,var_ms1_isotope_correlation_score,var_ms1_isotope_overlap_score,var_massdev_score,main_var_xx_swath_prelim_score";

Expand Down Expand Up @@ -363,319 +365,349 @@
return seeds_added;
}

/**
  @brief Run ID-guided feature detection on the spectra previously loaded into 'ms_data_'.

  @param peptides      Internal peptide IDs (from this file); taken by value because they are
                       annotated, extended with seeds/offset peptides and partially moved out.
  @param proteins      Protein IDs corresponding to the internal peptides (copied into 'features').
  @param peptides_ext  External peptide IDs (from other files); empty disables SVM-related scoring.
  @param proteins_ext  Protein IDs corresponding to the external peptides.
  @param features      Output feature map (filled by this function).
  @param seeds         Optional seed features (mutually exclusive with external IDs).
  @param spectra_file  Path of the raw data file, stored as primary MS run path.

  @throws Exception::InvalidParameter if the SVM sample size is too small for cross-validation.
  @throws Exception::IllegalArgument  if both seeds and external IDs are given.
*/
void FeatureFinderIdentificationAlgorithm::run(
vector<PeptideIdentification> peptides,
const vector<ProteinIdentification>& proteins,
vector<PeptideIdentification> peptides_ext,
vector<ProteinIdentification> proteins_ext,
FeatureMap& features,
const FeatureMap& seeds,
const String& spectra_file
)
{
// fail early: the SVM cross-validation needs at least two samples per fold
if ((svm_n_samples_ > 0) && (svm_n_samples_ < 2 * svm_n_parts_))
{
String msg = "Sample size of " + String(svm_n_samples_) +
" (parameter 'svm:samples') is not enough for " + String(svm_n_parts_) +
"-fold cross-validation (parameter 'svm:xval').";
throw Exception::InvalidParameter(__FILE__, __LINE__,
OPENMS_PRETTY_FUNCTION, msg);
}

// annotate mzML file
features.setPrimaryMSRunPath({spectra_file}, ms_data_);

// initialize algorithm classes needed later:
Param params = feat_finder_.getParameters();
params.setValue("stop_report_after_feature", -1); // return all features
params.setValue("EMGScoring:max_iteration", param_.getValue("EMGScoring:max_iteration"));
params.setValue("EMGScoring:init_mom", param_.getValue("EMGScoring:init_mom"));
params.setValue("Scores:use_rt_score", "false"); // RT may not be reliable
params.setValue("Scores:use_ionseries_scores", "false"); // since FFID only uses MS1 spectra, this is useless
params.setValue("Scores:use_ms2_isotope_scores", "false"); // since FFID only uses MS1 spectra, this is useless
params.setValue("Scores:use_ms1_correlation", "false"); // this would be redundant to the "MS2" correlation and since
// precursor transition = first product transition, additionally biased
params.setValue("Scores:use_ms1_mi", "false"); // same as above. On MS1 level we basically only care about the "MS1 fullscan" scores
//TODO for MS1 level scoring there is an additional parameter add_up_spectra with which we can add up spectra
// around the apex, to complete isotopic envelopes (and therefore make this score more robust).

if (peptides_ext.empty()) // SVM scores not needed, disable the most expensive ones
{
// TODO this score also affects Swath LDA prescoring. I wonder if/how this impacts the
// creation/addition of a feature.
params.setValue("Scores:use_elution_model_score", "false");
}

if ((elution_model_ != "none") || (!candidates_out_.empty()))
{
params.setValue("write_convex_hull", "true");
}
if (min_peak_width_ < 1.0)
{
// values below 1 are interpreted as a fraction of the expected peak width
min_peak_width_ *= peak_width_;
}

// TODO I wonder if the following parameters would be enough.
// In theory we only care for one feature per one set of extracted chromatograms (transition group)
//params.setValue("stop_report_after_feature", 1); // best by quality, after scoring
//params.setValue("TransitionGroupPicker:stop_after_feature", 1); // best by intensity, after picking, before scoring

params.setValue("TransitionGroupPicker:PeakPickerChromatogram:gauss_width",
peak_width_);
params.setValue("TransitionGroupPicker:min_peak_width", min_peak_width_);
// disabling the signal-to-noise threshold (setting the parameter to zero)
// totally breaks the OpenSWATH feature detection (no features found)!
params.setValue("TransitionGroupPicker:PeakPickerChromatogram:signal_to_noise",
signal_to_noise_);
params.setValue("TransitionGroupPicker:recalculate_peaks", "true");
params.setValue("TransitionGroupPicker:PeakPickerChromatogram:peak_width", -1.0);
// (previously this parameter was redundantly set twice in a row)
params.setValue("TransitionGroupPicker:PeakPickerChromatogram:method", "corrected"); // default

// ======================================================================================================================================
// TODO: The following group of parameters probably only have an effect if ElutionModelFitter at the end of FFID
// (in post_process_) is deactivated with 'elution_model = "none"'.
// Otherwise ElutionModelFitter fits a new model (EGH instead of EMG) that takes into account all traces per feature
// at the same time and overwrites intensities. "Old" SWATH intensities might be available in the feature as metavalue
// "raw_intensity" though.

// Note: this EMG fitting only affects integration, not scoring
params.setValue("TransitionGroupPicker:PeakIntegrator:fit_EMG", "false"); // default (= disabled)
// Note: Fitting for scoring is controlled via EMGScoring parameter group. Fitting for integration here uses a completely
// different algorithm (EmgGradientDescent class; params not exposed) with gradient descent instead of Eigen::LevenbergMarquardt algorithm (EmgFitter1D class).
// TODO: evaluate difference in runtimes and quality of result
params.setValue("TransitionGroupPicker:PeakIntegrator:integration_type", "intensity_sum"); // default
params.setValue("TransitionGroupPicker:background_subtraction", "exact"); // enable (default was "none")
params.setValue("TransitionGroupPicker:PeakIntegrator:baseline_type", "base_to_base"); // default
// ======================================================================================================================================

params.setValue("TransitionGroupPicker:PeakPickerChromatogram:write_sn_log_messages", "false"); // disabled in OpenSWATH

feat_finder_.setParameters(params);
feat_finder_.setLogType(ProgressLogger::NONE);
feat_finder_.setStrictFlag(false);
// to use MS1 Swath scores:
feat_finder_.setMS1Map(SimpleOpenMSSpectraFactory::getSpectrumAccessOpenMSPtr(boost::make_shared<MSExperiment>(ms_data_)));

double rt_uncertainty(0);
bool with_external_ids = !peptides_ext.empty();

if (with_external_ids && !seeds.empty())
{
throw Exception::IllegalArgument(
__FILE__,
__LINE__,
OPENMS_PRETTY_FUNCTION,
"Using seeds and external ids is currently not supported.");
}

if (with_external_ids)
{
// align internal and external IDs to estimate RT shifts:
MapAlignmentAlgorithmIdentification aligner;
aligner.setReference(peptides_ext); // go from internal to external scale
vector<vector<PeptideIdentification> > aligner_peptides(1, peptides);
vector<TransformationDescription> aligner_trafos;

OPENMS_LOG_INFO << "Realigning internal and external IDs...";
aligner.align(aligner_peptides, aligner_trafos);
trafo_external_ = aligner_trafos[0];
vector<double> aligned_diffs;
trafo_external_.getDeviations(aligned_diffs);
// Pick the RT deviation at the requested quantile. The rank must be
// computed in signed-safe fashion: the previous code did
// "max(Size(0), Size(...) - 1)", but 'Size' is unsigned, so a rank of 0
// wrapped around to a huge index before 'max' could clamp it, causing a
// potential out-of-bounds read (also for an empty deviations vector).
if (!aligned_diffs.empty())
{
Size rank = Size(rt_quantile_ * static_cast<double>(aligned_diffs.size()));
Size index = (rank > 0) ? rank - 1 : 0;
rt_uncertainty = aligned_diffs[index];
}
try
{
aligner_trafos[0].fitModel("lowess");
trafo_external_ = aligner_trafos[0];
}
catch (Exception::BaseException& e)
{
OPENMS_LOG_ERROR << "Error: Failed to align RTs of internal/external peptides. RT information will not be considered in the SVM classification. The original error message was:\n" << e.what() << endl;
}
}

if (rt_window_ == 0.0)
{
// calculate RT window based on other parameters and alignment quality:
double map_tol = mapping_tolerance_;
if (map_tol < 1.0)
{
map_tol *= (2 * peak_width_); // relative tolerance
}
rt_window_ = (rt_uncertainty + 2 * peak_width_ + map_tol) * 2;
OPENMS_LOG_INFO << "RT window size calculated as " << rt_window_ << " seconds."
<< endl;
}

//-------------------------------------------------------------
// prepare peptide map
//-------------------------------------------------------------
OPENMS_LOG_INFO << "Preparing mapping of peptide data..." << endl;
peptide_map_.clear();

// Reserve enough space for all possible seeds (pointers into 'peptides' are
// stored in 'peptide_map_', so the vector must not reallocate later)
{
Size max_size = peptides.size() + seeds.size();
if (add_mass_offset_peptides_ > 0.0) // consistent with the check below
{
max_size *= 2;
}
peptides.reserve(max_size);
}

for (auto& pep : peptides)
{
addPeptideToMap_(pep, peptide_map_);
pep.setMetaValue("FFId_category", "internal");
}

// TODO make sure that only assembled traces (more than one trace -> has a charge) if FFMetabo is used
// see FeatureFindingMetabo: defaults_.setValue("remove_single_traces", "false", "Remove unassembled traces (single traces).");
Size seeds_added = addSeeds_(peptides, seeds);
OPENMS_LOG_INFO << "#Seeds without RT and m/z overlap with identified peptides added: " << seeds_added << endl;

if (add_mass_offset_peptides_ > 0.0)
{
Size n_added = addOffsetPeptides_(peptides, add_mass_offset_peptides_);
OPENMS_LOG_INFO << "#Offset peptides without RT and m/z overlap with other peptides added: " << n_added << endl;
}

n_internal_peps_ = peptide_map_.size();
for (PeptideIdentification& pep : peptides_ext)
{
addPeptideToMap_(pep, peptide_map_, true);
pep.setMetaValue("FFId_category", "external");
}
n_external_peps_ = peptide_map_.size() - n_internal_peps_;

boost::shared_ptr<PeakMap> shared = boost::make_shared<PeakMap>(ms_data_);
OpenSwath::SpectrumAccessPtr spec_temp =
SimpleOpenMSSpectraFactory::getSpectrumAccessOpenMSPtr(shared);
auto chunks = chunk_(peptide_map_.begin(), peptide_map_.end(), batch_size_);

PeptideRefRTMap ref_rt_map;
if (debug_level_ >= 668)
{
OPENMS_LOG_INFO << "Creating full assay library for debugging." << endl;
// Warning: this step is pretty inefficient, since it does the whole library generation twice
// Really use for debug only
createAssayLibrary_(peptide_map_.begin(), peptide_map_.end(), ref_rt_map, false);
cout << "Writing debug.traml file." << endl;
FileHandler().storeTransitions("debug.traml", library_);
ref_rt_map.clear();
library_.clear(true);
}

//-------------------------------------------------------------
// run feature detection
//-------------------------------------------------------------
//Note: progress only works in non-debug when no logs come in-between
getProgressLogger().startProgress(0, chunks.size(), "Creating assay library and extracting chromatograms");
Size chunk_count = 0;
for (auto& chunk : chunks)
{
//TODO since ref_rt_map is only used after chunking, we could create
// maps per chunk and merge them in the end. Would help in parallelizing as well.
createAssayLibrary_(chunk.first, chunk.second, ref_rt_map);
OPENMS_LOG_DEBUG << "#Transitions: " << library_.getTransitions().size() << endl;

ChromatogramExtractor extractor;
// extractor.setLogType(ProgressLogger::NONE);
{
vector<OpenSwath::ChromatogramPtr> chrom_temp;
vector<ChromatogramExtractor::ExtractionCoordinates> coords;
// take entries in library_ and put to chrom_temp and coords
extractor.prepare_coordinates(chrom_temp, coords, library_,
numeric_limits<double>::quiet_NaN(), false);


extractor.extractChromatograms(spec_temp, chrom_temp, coords, mz_window_,
mz_window_ppm_, "tophat");
extractor.return_chromatogram(chrom_temp, coords, library_, (*shared)[0],
chrom_data_.getChromatograms(), false);
}

OPENMS_LOG_DEBUG << "Extracted " << chrom_data_.getNrChromatograms()
<< " chromatogram(s)." << endl;

OPENMS_LOG_DEBUG << "Detecting chromatographic peaks..." << endl;
// suppress status output from OpenSWATH, unless in debug mode:
if (debug_level_ < 1)
{
OpenMS_Log_info.remove(cout);
}
feat_finder_.pickExperiment(chrom_data_, features, library_,
TransformationDescription(), ms_data_);
if (debug_level_ < 1)
{
OpenMS_Log_info.insert(cout); // revert logging change
}
chrom_data_.clear(true);
library_.clear(true);
// since chrom_data_ here is just a container for the chromatograms and identifications will be empty,
// pickExperiment above will only add empty ProteinIdentification runs with colliding identifiers.
// Usually we could sanitize the identifiers or merge the runs, but since they are empty and we add the
// "real" proteins later -> just clear them
features.getProteinIdentifications().clear();
getProgressLogger().setProgress(++chunk_count);
}
getProgressLogger().endProgress();

OPENMS_LOG_INFO << "Found " << features.size() << " feature candidates in total."
<< endl;

ms_data_.reset(); // not needed anymore, free up the memory
// complete feature annotation:
annotateFeatures_(features, ref_rt_map);

// sort everything:
sort(features.getUnassignedPeptideIdentifications().begin(),
features.getUnassignedPeptideIdentifications().end(),
peptide_compare_);
sort(features.begin(), features.end(), feature_compare_);

postProcess_(features, with_external_ids);
statistics_(features);

features.setProteinIdentifications(proteins);
// add external IDs (if any):
features.getProteinIdentifications().insert(
features.getProteinIdentifications().end(), proteins_ext.begin(),
proteins_ext.end());
features.getUnassignedPeptideIdentifications().insert(
features.getUnassignedPeptideIdentifications().end(),
peptides_ext.begin(), peptides_ext.end());

// remove all hits with pseudo ids (seeds)
for (Feature& f : features)
{
std::vector<PeptideIdentification>& ids = f.getPeptideIdentifications();

// if we have peptide identifications assigned and all are annotated as OffsetPeptide, we mark the feature is also an OffsetPeptide
if (!ids.empty() && std::all_of(ids.begin(), ids.end(), [](const PeptideIdentification & pid) { return pid.metaValueExists("OffsetPeptide"); }))
{
f.setMetaValue("OffsetPeptide", "true");
}

// remove all hits (PSM details) whose sequence carries the seed marker prefix
for (auto & pid : ids)
{
std::vector<PeptideHit>& hits = pid.getHits();
auto it = remove_if(hits.begin(), hits.end(),
[](const PeptideHit & ph)
{
return (ph.getSequence().toUnmodifiedString().hasPrefix("XXX"));
});
hits.erase(it, hits.end()); // remove / erase idiom
}

// remove empty PeptideIdentifications
auto it = remove_if(ids.begin(), ids.end(),
[](const PeptideIdentification & pid)
{
return pid.empty();
});
ids.erase(it, ids.end()); // remove / erase idiom
}

// clean up unassigned PeptideIdentifications
std::vector<PeptideIdentification>& ids = features.getUnassignedPeptideIdentifications();
for (auto & pid : ids)
{
std::vector<PeptideHit>& hits = pid.getHits();
auto it = remove_if(hits.begin(), hits.end(), [](const PeptideHit & ph)
{
return (ph.getSequence().toUnmodifiedString().hasPrefix("XXX"));
});
hits.erase(it, hits.end());
}

// remove empty PeptideIdentifications
auto it = remove_if(ids.begin(), ids.end(),
[](const PeptideIdentification & pid)
{
return pid.empty();
});
ids.erase(it, ids.end()); // remove / erase idiom

// add back ignored PSMs
features.getUnassignedPeptideIdentifications().insert(features.getUnassignedPeptideIdentifications().end(),
std::move_iterator(unassignedIDs_.begin()),
std::move_iterator(unassignedIDs_.end()));

features.ensureUniqueId();
}

Check notice on line 710 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L368-L710

Complex Method
void FeatureFinderIdentificationAlgorithm::postProcess_(
FeatureMap & features,
bool with_external_ids)
Expand Down Expand Up @@ -727,7 +759,7 @@
}
}
}

Check notice on line 762 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L762

Redundant blank line at the end of a code block should be deleted. (whitespace/blank_line)
}

void FeatureFinderIdentificationAlgorithm::runOnCandidates(FeatureMap & features)
Expand Down Expand Up @@ -781,7 +813,7 @@
postProcess_(features, with_external_ids);

statistics_(features);

Check notice on line 816 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L816

Redundant blank line at the end of a code block should be deleted. (whitespace/blank_line)
}

void FeatureFinderIdentificationAlgorithm::statistics_(FeatureMap const & features) const
Expand Down Expand Up @@ -821,184 +853,184 @@
<< " peptides without features ("
<< n_internal_peps_ - quantified_internal.size() << " internal, "
<< n_missing_external << " external)\n" << endl;

Check notice on line 856 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L856

Redundant blank line at the end of a code block should be deleted. (whitespace/blank_line)
}

// Build the OpenSWATH assay library ('library_') for the peptide_map_ entries
// in the given iterator range, and record per generated assay ID the
// internal/external peptide identifications in 'ref_rt_map'.
// Seeds (fake sequences prefixed with "XXX") get one assay per seed hit;
// real peptides get one assay per charge state and RT region.
// 'clear_IDs' is forwarded to getRTRegions_ (presumably allows freeing the
// per-region ID maps after use - TODO confirm against getRTRegions_).
void FeatureFinderIdentificationAlgorithm::createAssayLibrary_(const PeptideMap::iterator& begin, const PeptideMap::iterator& end, PeptideRefRTMap& ref_rt_map, bool clear_IDs)
{
// accessions of all proteins referenced by any assay; added to the library at the end
std::set<String> protein_accessions;

Size seedcount = 0;
for (auto pm_it = begin;
pm_it != end; ++pm_it)
{
TargetedExperiment::Peptide peptide;
const AASequence &seq = pm_it->first;


// @NOTE: Technically, "TargetedExperiment::Peptide" stores the unmodified
// sequence and the modifications separately. Unfortunately, creating the
// modifications vector is complex and there is currently no convenient
// conversion function (see "TargetedExperimentHelper::getAASequence" for
// the reverse conversion). However, "Peptide" is later converted to
// "OpenSwath::LightPeptide" anyway, and this is done via "AASequence"
// (see "OpenSwathDataAccessHelper::convertTargetedPeptide"). So for our
// purposes it works to just store the sequence including modifications in
// "Peptide".

// for now, seeds are stored in the same PeptideRefMap, all
// under the same fake sequence key entry
// TODO add own data structure for them
if (seq.toUnmodifiedString().hasPrefix("XXX")) // seed
{
// This will force the SWATH scores to consider it like an unidentified peptide and e.g. use averagine isotopes
peptide.sequence = "";
// we do not have to aggregate their retention times, therefore just
// iterate over the entries
const ChargeMap& cm = pm_it->second;
for (const auto& charge_rtmap : cm)
{
Int charge = charge_rtmap.first;
// only go through internals for seeds (->first). External seeds are not supported
for (const auto& rt_pep : charge_rtmap.second.first)
{
// since we don't know their IDs, seeds will all need a different grouplabel in SWATH
// to not be combined
seedcount++;

double mz = rt_pep.second->getMZ();
double rt = rt_pep.second->getRT();
String uid = rt_pep.second->getMetaValue("SeedFeatureID");

// UID should be enough, but let's add the seed count to be sure.
String peptide_id = peptide.sequence + "[" + uid + "][" + String(seedcount) + "]/" + String(charge);
peptide.setChargeState(charge);
peptide.id = peptide_id;
peptide.protein_refs = {"not_available"};
peptide.setPeptideGroupLabel(peptide_id);

//create an entry in the "output" ref_rt_map for internals
RTMap &internal_ids = ref_rt_map[peptide_id].first;

// get isotope distribution for peptide:
//TODO Why 10? Document constant?
// with a probability cutoff, start from 10 isotopes and trim below;
// otherwise use the configured fixed number of isotopes
Size n_isotopes = (isotope_pmin_ > 0.0) ? 10 : n_isotopes_;
CoarseIsotopePatternGenerator generator(n_isotopes);
// seeds have no known sequence, so estimate an averagine-like
// distribution from the (neutral) peptide weight derived from m/z
IsotopeDistribution iso_dist = generator
.estimateFromPeptideWeight(mz * charge - charge * Constants::PROTON_MASS_U);
if (isotope_pmin_ > 0.0)
{
iso_dist.trimLeft(isotope_pmin_);
iso_dist.trimRight(isotope_pmin_);
iso_dist.renormalize();
}

// extraction window is centered on the seed RT
double rt_tolerance = seed_rt_window_ / 2.0;

// store beginning and end of RT region: here we only need one entry
peptide.rts.clear();
addPeptideRT_(peptide, rt - rt_tolerance);
addPeptideRT_(peptide, rt + rt_tolerance);
library_.addPeptide(peptide);
generateTransitions_(peptide.id, mz, charge, iso_dist);
internal_ids.emplace(rt_pep);
}
}
}
else
{
peptide.sequence = seq.toString();
// keep track of protein accessions:
set<String> current_accessions;
// internal/external pair
const pair<RTMap, RTMap> &pair = pm_it->second.begin()->second;

// WARNING: This assumes that at least one hit is present.
// (Prefer an internal ID; fall back to the first external one.)
const PeptideHit &hit = (pair.first.empty() ?
pair.second.begin()->second->getHits()[0] :
pair.first.begin()->second->getHits()[0]);
current_accessions = hit.extractProteinAccessionsSet();
protein_accessions.insert(current_accessions.begin(),
current_accessions.end());
// missing protein accession would crash OpenSWATH algorithms:
if (current_accessions.empty())
{
current_accessions.insert("not_available");
}

peptide.protein_refs = vector<String>(current_accessions.begin(),
current_accessions.end());
// get regions in which peptide eludes (ideally only one):
std::vector<RTRegion> rt_regions;
getRTRegions_(pm_it->second, rt_regions, clear_IDs);

// get isotope distribution for peptide:
Size n_isotopes = (isotope_pmin_ > 0.0) ? 10 : n_isotopes_;
IsotopeDistribution iso_dist =
seq.getFormula(Residue::Full, 0).getIsotopeDistribution(CoarseIsotopePatternGenerator(n_isotopes));
if (isotope_pmin_ > 0.0)
{
iso_dist.trimLeft(isotope_pmin_);
iso_dist.trimRight(isotope_pmin_);
iso_dist.renormalize();
}

// go through different charge states:
for (ChargeMap::const_iterator cm_it = pm_it->second.begin();
cm_it != pm_it->second.end(); ++cm_it)
{
Int charge = cm_it->first;

double mz = seq.getMZ(charge);
OPENMS_LOG_DEBUG << "\nPeptide " << peptide.sequence << "/" << charge << " (m/z: " << mz << "):" << endl;
peptide.setChargeState(charge);
String peptide_id = peptide.sequence + "/" + String(charge);

// we want to detect one feature per peptide and charge state - if there
// are multiple RT regions, group them together:
peptide.setPeptideGroupLabel(peptide_id);
peptide.rts.clear();
Size counter = 0;
// accumulate IDs over multiple regions:
RTMap &internal_ids = ref_rt_map[peptide_id].first;
RTMap &external_ids = ref_rt_map[peptide_id].second;
for (RTRegion& reg : rt_regions)
{
if (reg.ids.count(charge))
{
OPENMS_LOG_DEBUG_NOFILE << "Charge " << charge << ", Region# " << counter + 1 << " (RT: "
<< float(reg.start) << "-" << float(reg.end)
<< ", size " << float(reg.end - reg.start) << ")"
<< endl;

// with multiple regions, assay IDs get a ":<n>" suffix to stay unique
peptide.id = peptide_id;
if (rt_regions.size() > 1)
peptide.id += ":" + String(++counter);

// store beginning and end of RT region:
peptide.rts.clear();
addPeptideRT_(peptide, reg.start);
addPeptideRT_(peptide, reg.end);
library_.addPeptide(peptide);
generateTransitions_(peptide.id, mz, charge, iso_dist);
}
// note: IDs are accumulated for ALL regions (even without this charge),
// keyed by the un-suffixed peptide_id
internal_ids.insert(reg.ids[charge].first.begin(),
reg.ids[charge].first.end());
external_ids.insert(reg.ids[charge].second.begin(),
reg.ids[charge].second.end());
}
}
}
}
// add proteins to library:
for (String const &acc : protein_accessions)
{
TargetedExperiment::Protein protein;
protein.id = acc;
library_.addProtein(protein);
}
}

Check notice on line 1033 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L859-L1033

Complex Method
void FeatureFinderIdentificationAlgorithm::getRTRegions_(
ChargeMap& peptide_data,
std::vector<RTRegion>& rt_regions,
Expand Down Expand Up @@ -1179,188 +1211,188 @@
}

/// Annotate feature candidates with m/z, charge, convex hulls and isotope
/// probabilities; map peptide IDs to features by RT overlap and classify each
/// feature ("ambiguous"/"negative"/"unknown"); finally collect peptide IDs
/// whose assays produced no feature candidates as unassigned IDs.
/// Assumes 'features' is grouped by assay ("PeptideRef" metavalue), so
/// per-assay state can be finalized whenever the ref changes.
void FeatureFinderIdentificationAlgorithm::annotateFeatures_(FeatureMap& features, PeptideRefRTMap& ref_rt_map)
{
String previous_ref, peptide_ref;
RTMap transformed_internal; // internal IDs mapped to the external RT scale (lazy cache)
Size i = 0; // index of 'feat' within 'features' (kept in sync with the loop)
map<Size, vector<PeptideIdentification*> > feat_ids; // feature index -> matching internal IDs
for (Feature& feat : features)
{
feat.setMZ(feat.getMetaValue("PrecursorMZ"));
feat.setCharge(feat.getPeptideIdentifications()[0].getHits()[0].
getCharge());
ensureConvexHulls_(feat);
// remove "fake" IDs generated by OpenSWATH (they would be removed with
// a warning when writing output, because of missing protein
// identification with corresponding identifier):
feat.getPeptideIdentifications().clear();
// annotate subordinates with theoretical isotope intensities:
for (Feature& sub : feat.getSubordinates())
{
String native_id = sub.getMetaValue("native_id");
sub.setMetaValue("isotope_probability", isotope_probs_[native_id]);
}

peptide_ref = feat.getMetaValue("PeptideRef");
// remove region number, if present ("<seq>/<charge>:<region>" -> "<seq>/<charge>"):
Size pos_slash = peptide_ref.rfind('/');
Size pos_colon = peptide_ref.find(':', pos_slash + 2);
peptide_ref = peptide_ref.substr(0, pos_colon);

// entering a new assay: finalize the bookkeeping for the previous one
if (peptide_ref != previous_ref)
{
if (!previous_ref.empty())
{
annotateFeaturesFinalizeAssay_(
features, feat_ids, ref_rt_map[previous_ref].first);
}
previous_ref = peptide_ref;
}

RTMap& rt_internal = ref_rt_map[peptide_ref].first;
RTMap& rt_external = ref_rt_map[peptide_ref].second;

if (rt_internal.empty() && rt_external.empty())
{
// every feature must stem from some assay -> dump the map for debugging
OPENMS_LOG_DEBUG << "PeptideRefs in RTMap:" << endl;
for (const auto& rtm : ref_rt_map)
{
OPENMS_LOG_DEBUG << rtm.first << endl;
}

throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "RT internal and external are both empty for peptide '" + String(peptide_ref) + "' stored as '" + String(feat.getMetaValue("PeptideRef")) + "'.");
}

if (!rt_internal.empty()) // validate based on internal IDs
{
// map IDs to features (based on RT):
// note: features[i] is the same element as 'feat' here
double rt_min = features[i].getMetaValue("leftWidth");
double rt_max = features[i].getMetaValue("rightWidth");
if (mapping_tolerance_ > 0.0)
{
double abs_tol = mapping_tolerance_;
if (abs_tol < 1.0)
{
// tolerance below 1 is relative to the feature's RT width
abs_tol *= (rt_max - rt_min);
}
rt_min -= abs_tol;
rt_max += abs_tol;
}
RTMap::const_iterator lower = rt_internal.lower_bound(rt_min);
RTMap::const_iterator upper = rt_internal.upper_bound(rt_max);
int id_count = 0;
for (; lower != upper; ++lower)
{
feat_ids[i].push_back(lower->second);
++id_count;
}
// "total" only includes IDs from this RT region:
feat.setMetaValue("n_total_ids", rt_internal.size());
feat.setMetaValue("n_matching_ids", id_count);
if (id_count > 0) // matching IDs -> feature may be correct
{
feat.setMetaValue("feature_class", "ambiguous");
}
else // no matching IDs -> feature is wrong
{
feat.setMetaValue("feature_class", "negative");
}
}
else // only external IDs -> no validation possible
{
feat.setMetaValue("n_total_ids", 0);
feat.setMetaValue("n_matching_ids", -1);
feat.setMetaValue("feature_class", "unknown");
// add "dummy" peptide identification:
PeptideIdentification id = *(rt_external.begin()->second);
id.clearMetaInfo();
id.setMetaValue("FFId_category", "implied");
id.setRT(feat.getRT());
id.setMZ(feat.getMZ());
// only one peptide hit per ID - see function "addPeptideToMap_":
PeptideHit& hit = id.getHits()[0];
hit.clearMetaInfo();
hit.setScore(0.0);
feat.getPeptideIdentifications().push_back(id);
}

// distance from feature to closest peptide ID:
if (!trafo_external_.getDataPoints().empty())
{
// use external IDs if available, otherwise RT-transformed internal IDs
// (but only compute the transform if necessary, once per assay!):
// NOTE(review): at this point 'previous_ref' has already been set equal
// to 'peptide_ref' above, so the second condition is always false and
// the cache is only rebuilt when empty - a non-empty cache from a
// previous assay looks like it could be reused here; verify intent.
if (rt_external.empty() && (transformed_internal.empty() ||
(peptide_ref != previous_ref)))
{
transformed_internal.clear();
for (RTMap::const_iterator it = rt_internal.begin();
it != rt_internal.end(); ++it)
{
double transformed_rt = trafo_external_.apply(it->first);
RTMap::value_type pair = make_pair(transformed_rt, it->second);
transformed_internal.insert(transformed_internal.end(), pair);
}
}
const RTMap& rt_ref = (rt_external.empty() ? transformed_internal :
rt_external);

double rt_min = feat.getMetaValue("leftWidth");
double rt_max = feat.getMetaValue("rightWidth");
if (mapping_tolerance_ > 0.0)
{
double abs_tol = mapping_tolerance_;
if (abs_tol < 1.0)
{
abs_tol *= (rt_max - rt_min);
}
rt_min -= abs_tol;
rt_max += abs_tol;
}
RTMap::const_iterator lower = rt_ref.lower_bound(rt_min);
RTMap::const_iterator upper = rt_ref.upper_bound(rt_max);
if (lower != upper) // there's at least one ID within the feature
{
feat.setMetaValue("rt_delta", 0.0);
}
else // check closest ID
{
double rt_delta1 = numeric_limits<double>::infinity();
if (lower != rt_ref.begin())
{
rt_delta1 = fabs((--lower)->first - rt_min);
}
double rt_delta2 = numeric_limits<double>::infinity();
if (upper != rt_ref.end())
{
// NOTE(review): distance of the next-higher ID is measured against
// rt_min, not rt_max - looks asymmetric; confirm this is intended
rt_delta2 = fabs(upper->first - rt_min);
}
feat.setMetaValue("rt_delta", min(rt_delta1, rt_delta2));
}
}
++i;
}
// set of features from the last assay:
annotateFeaturesFinalizeAssay_(features, feat_ids,
ref_rt_map[peptide_ref].first);
// store unassigned peptide IDs from assays that did not generate any
// feature candidates:
for (PeptideRefRTMap::iterator ref_it = ref_rt_map.begin();
ref_it != ref_rt_map.end(); ++ref_it)
{
RTMap& rt_internal = ref_it->second.first;
if (!rt_internal.empty()) // not cleared by '...FinalizeAssay()'
{
for (RTMap::const_iterator rt_it = rt_internal.begin();
rt_it != rt_internal.end(); ++rt_it)
{
const PeptideIdentification& pep_id = *(rt_it->second);
features.getUnassignedPeptideIdentifications().push_back(pep_id);
}
}
}
}

Check notice on line 1395 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L1214-L1395

Complex Method
void FeatureFinderIdentificationAlgorithm::ensureConvexHulls_(Feature& feature) const
{
if (feature.getConvexHulls().empty()) // add hulls for mass traces
Expand Down Expand Up @@ -1520,7 +1552,7 @@
double thresholds[2] = {counts[1] / float(counts[0]),
counts[0] / float(counts[1])};
// check middle values:
double rnd = rand() / double(RAND_MAX); // random num. in range 0-1

Check notice on line 1555 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L1555

Consider using rand_r(...) instead of rand(...) for improved thread safety. (runtime/threadsafe_fn)
if (rnd < thresholds[middle->second.second])
{
training_labels[middle->second.first] = Int(middle->second.second);
Expand Down Expand Up @@ -1597,128 +1629,128 @@
training_labels.swap(temp);
}

// Classify feature candidates as correct/incorrect using an SVM trained on the
// features previously labeled "positive"/"negative" (meta value
// "feature_class"). Stores the predicted class and probability as meta values
// ("predicted_class", "predicted_probability") and uses the probability of the
// positive class as the feature's overall quality.
void FeatureFinderIdentificationAlgorithm::classifyFeatures_(FeatureMap& features)
{
  if (features.empty())
  {
    return;
  }
  // "rt_delta" is only set on features in some workflows; if present, use it
  // as an additional SVM predictor (add it at most once):
  if (features[0].metaValueExists("rt_delta"))
  {
    if (std::find(svm_predictor_names_.begin(), svm_predictor_names_.end(),
                  "rt_delta") == svm_predictor_names_.end())
    {
      svm_predictor_names_.push_back("rt_delta");
    }
  }
  // values for all features per predictor (this way around to simplify scaling
  // of predictors):
  SimpleSVM::PredictorMap predictors;
  for (const String& pred : svm_predictor_names_)
  {
    auto& values = predictors[pred]; // look up the map entry only once
    values.reserve(features.size());
    for (const Feature& feat : features)
    {
      if (!feat.metaValueExists(pred))
      {
        OPENMS_LOG_ERROR << "Meta value '" << pred << "' missing for feature '"
                         << feat.getUniqueId() << "'" << endl;
        // a missing value invalidates the whole predictor; 'values' dangles
        // after the erase, but we leave the inner loop immediately:
        predictors.erase(pred);
        break;
      }
      values.push_back(feat.getMetaValue(pred));
    }
  }

  // get labels for SVM:
  std::map<Size, double> training_labels;
  const bool no_selection = param_.getValue("svm:no_selection") == "true";
  // mapping (for bias correction): intensity -> (index, positive?)
  std::multimap<double, pair<Size, bool> > valid_obs;
  Size n_obs[2] = {0, 0}; // counters for neg./pos. observations
  for (Size feat_index = 0; feat_index < features.size(); ++feat_index)
  {
    const String feature_class = features[feat_index].getMetaValue("feature_class");
    int label = -1; // -1: unlabeled, 0: negative, 1: positive
    if (feature_class == "positive")
    {
      label = 1;
    }
    else if (feature_class == "negative")
    {
      label = 0;
    }
    if (label != -1)
    {
      ++n_obs[label];
      if (!no_selection)
      {
        // defer selection to the intensity-bias-corrected sampling below:
        double intensity = features[feat_index].getIntensity();
        valid_obs.insert(make_pair(intensity, make_pair(feat_index,
                                                        label == 1)));
      }
      else
      {
        training_labels[feat_index] = static_cast<double>(label);
      }
    }
  }
  checkNumObservations_(n_obs[1], n_obs[0]);

  if (!no_selection)
  {
    getUnbiasedSample_(valid_obs, training_labels);
  }
  if (svm_n_samples_ > 0) // limited number of samples for training
  {
    if (training_labels.size() < svm_n_samples_)
    {
      OPENMS_LOG_WARN << "Warning: There are only " << training_labels.size()
                      << " valid observations for training." << endl;
    }
    else if (training_labels.size() > svm_n_samples_)
    {
      getRandomSample_(training_labels);
    }
  }

  SimpleSVM svm;
  // set (only) the relevant parameters:
  Param svm_params = svm.getParameters();
  Logger::LogStream no_log; // suppress warnings about additional parameters
  svm_params.update(param_.copy("svm:", true), false, no_log);
  svm.setParameters(svm_params);
  svm.setup(predictors, training_labels);
  if (!svm_xval_out_.empty())
  {
    svm.writeXvalResults(svm_xval_out_);
  }
  // for a linear kernel the per-predictor weights are informative - log them:
  if ((debug_level_ > 0) && svm_params.getValue("kernel") == "linear")
  {
    std::map<String, double> feature_weights;
    svm.getFeatureWeights(feature_weights);
    OPENMS_LOG_DEBUG << "SVM feature weights:" << endl;
    for (const auto& weight : feature_weights)
    {
      OPENMS_LOG_DEBUG << "- " << weight.first << ": " << weight.second << endl;
    }
  }

  std::vector<SimpleSVM::Prediction> predictions;
  svm.predict(predictions);
  OPENMS_POSTCONDITION(predictions.size() == features.size(),
                       "SVM predictions for all features expected");
  for (Size i = 0; i < features.size(); ++i)
  {
    features[i].setMetaValue("predicted_class", predictions[i].outcome);
    double prob_positive = predictions[i].probabilities[1];
    features[i].setMetaValue("predicted_probability", prob_positive);
    // @TODO: store previous (OpenSWATH) overall quality in a meta value?
    features[i].setOverallQuality(prob_positive);
  }
}


Check notice on line 1753 in src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp

View check run for this annotation

codefactor.io / CodeFactor

src/openms/source/TRANSFORMATIONS/FEATUREFINDER/FeatureFinderIdentificationAlgorithm.cpp#L1632-L1753

Complex Method
void FeatureFinderIdentificationAlgorithm::filterFeaturesFinalizeAssay_(Feature& best_feature, double best_quality,
const double quality_cutoff)
{
Expand Down
Loading