Skip to content

Commit

Permalink
Merge pull request #110 from singjc/feature/dynamic_main_score
Browse files Browse the repository at this point in the history
Feature/dynamic main score
  • Loading branch information
grosenberger committed Dec 15, 2022
2 parents 420f17b + c192d7f commit 265fca4
Show file tree
Hide file tree
Showing 9 changed files with 210 additions and 56 deletions.
2 changes: 1 addition & 1 deletion pyprophet/classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def objective(params):
'scale_pos_weight': "{:.3f}".format(params['scale_pos_weight']),
}

clf = xgb.XGBClassifier(random_state=42, silent=1, objective='binary:logitraw', eval_metric='auc', **params)
clf = xgb.XGBClassifier(random_state=42, verbosity=0, objective='binary:logitraw', eval_metric='auc', **params)

score = cross_val_score(clf, X, y, scoring='roc_auc', n_jobs=self.threads, cv=KFold(n_splits=3, shuffle=True, random_state=np.random.RandomState(42))).mean()
# click.echo("Info: AUC: {:.3f} hyperparameters: {}".format(score, params))
Expand Down
27 changes: 26 additions & 1 deletion pyprophet/data_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,32 @@ def prepare_data_table(table,
df = cleanup_and_check(df)
return df, all_score_columns


def update_chosen_main_score_in_table(train, score_columns, use_as_main_score):
    """
    Promote ``use_as_main_score`` to be the main score of a feature table.

    The columns of ``train.df`` are first renamed from their aliases to the
    real feature score names in ``score_columns``, the score name list is
    re-ordered so that the newly chosen main score comes first, and the
    columns are then renamed back to aliases (``main_score``, ``var_0``,
    ``var_1``, ...) following that new order.

    Parameters
    ----------
    train : object exposing a pandas DataFrame as ``train.df``
        The table is modified in place.
    score_columns : sequence of str
        Real feature score names, positionally matching the score columns of
        ``train.df``. The first name containing ``'main'`` is treated as the
        current main score.
    use_as_main_score : str
        Name (as found in ``score_columns``) of the score to promote.

    Returns
    -------
    tuple
        ``(train, updated_score_columns)`` where ``updated_score_columns`` is
        a tuple of score names with the promoted score first. The previous
        main score keeps its original (``main_``-prefixed) name in the tuple.

    Raises
    ------
    IndexError
        If no name in ``score_columns`` contains ``'main'``.
    """
    # Columns of the table that are not feature scores
    non_score_columns = ('tg_id', 'tg_num_id', 'is_decoy', 'is_top_peak', 'is_train', 'classifier_score')

    # Get current main score column name
    old_main_score_column = [col for col in score_columns if 'main' in col][0]

    # Only the first occurrence is rewritten (count=1): replacing every
    # occurrence would corrupt names that contain "var" or "main_" more than
    # once (e.g. "var_variance" would become "main_var_main_variance").
    old_main_demoted = old_main_score_column.replace("main_", "", 1)
    new_main_promoted = use_as_main_score.replace("var", "main_var", 1)

    # Get the table's aliased score variable names
    df_column_score_alias = [col for col in train.df.columns if col not in non_score_columns]
    # Rename aliased columns to the actual feature score names (positional match)
    mapper = dict(zip(df_column_score_alias, score_columns))
    train.df.rename(columns=mapper, inplace=True)

    # Update columns to set the new main score: demote the old main score,
    # promote the chosen one, sort with the promoted score first and the rest
    # alphabetically, then restore the old main score's original name.
    updated_score_columns = [old_main_demoted if col == old_main_score_column else col for col in score_columns]
    updated_score_columns = [new_main_promoted if col == use_as_main_score else col for col in updated_score_columns]
    updated_score_columns = sorted(updated_score_columns, key=lambda x: (x != new_main_promoted, x))
    updated_score_columns = [old_main_score_column if col == old_main_demoted else col for col in updated_score_columns]

    # Rename columns back to feature aliases, following the new order
    mapper = {name: f'var_{i}' for i, name in enumerate(updated_score_columns[1:])}
    mapper[updated_score_columns[0].replace("main_", "", 1)] = 'main_score'
    train.df.rename(columns=mapper, inplace=True)

    # Re-order main_score column index
    # NOTE(review): position 5 is hard-coded; presumably it follows the five
    # id/flag columns listed above - confirm against the table layout.
    temp_col = train.df.pop('main_score')
    train.df.insert(5, temp_col.name, temp_col)
    click.echo(f"Info: Updated main score column from {old_main_score_column} to {use_as_main_score}...")
    return train, tuple(updated_score_columns)
class Experiment(object):

@profile
Expand Down
10 changes: 6 additions & 4 deletions pyprophet/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,11 @@ def type_cast_value(self, ctx, value):
@click.option('--tric_chromprob/--no-tric_chromprob', default=False, show_default=True, help='Whether chromatogram probabilities for TRIC should be computed.')
# Visualization
@click.option('--color_palette', default='normal', show_default=True, type=click.Choice(['normal', 'protan', 'deutran', 'tritan']), help='Color palette to use in reports.')
@click.option('--main_score_selection_report/--no-main_score_selection_report', default=False, show_default=True, help='Generate a report for main score selection process.')
# Processing
@click.option('--threads', default=1, show_default=True, type=int, help='Number of threads used for semi-supervised learning. -1 means all available CPUs.', callback=transform_threads)
@click.option('--test/--no-test', default=False, show_default=True, help='Run in test mode with fixed seed.')
def score(infile, outfile, classifier, xgb_autotune, apply_weights, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette):
def score(infile, outfile, classifier, xgb_autotune, apply_weights, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette, main_score_selection_report):
"""
Conduct semi-supervised learning and error-rate estimation for MS1, MS2 and transition-level data.
"""
Expand All @@ -103,13 +104,14 @@ def score(infile, outfile, classifier, xgb_autotune, apply_weights, xeval_fracti
# Prepare XGBoost-specific parameters
xgb_hyperparams = {'autotune': xgb_autotune, 'autotune_num_rounds': 10, 'num_boost_round': 100, 'early_stopping_rounds': 10, 'test_size': 0.33}

xgb_params = {'eta': 0.3, 'gamma': 0, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': 1, 'alpha': 0, 'scale_pos_weight': 1, 'silent': 1, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}
xgb_params = {'eta': 0.3, 'gamma': 0, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': 1, 'alpha': 0, 'scale_pos_weight': 1, 'verbosity': 0, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}

xgb_params_space = {'eta': hp.uniform('eta', 0.0, 0.3), 'gamma': hp.uniform('gamma', 0.0, 0.5), 'max_depth': hp.quniform('max_depth', 2, 8, 1), 'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1), 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': hp.uniform('lambda', 0.0, 1.0), 'alpha': hp.uniform('alpha', 0.0, 1.0), 'scale_pos_weight': 1.0, 'silent': 1, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}
xgb_params_space = {'eta': hp.uniform('eta', 0.0, 0.3), 'gamma': hp.uniform('gamma', 0.0, 0.5), 'max_depth': hp.quniform('max_depth', 2, 8, 1), 'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1), 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': hp.uniform('lambda', 0.0, 1.0), 'alpha': hp.uniform('alpha', 0.0, 1.0), 'scale_pos_weight': 1.0, 'verbosity': 0, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}

if not apply_weights:
PyProphetLearner(infile, outfile, classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette).run()
PyProphetLearner(infile, outfile, classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette, main_score_selection_report).run()
else:

PyProphetWeightApplier(infile, outfile, classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, apply_weights, ss_score_filter, color_palette).run()


Expand Down
19 changes: 12 additions & 7 deletions pyprophet/pyprophet.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,11 @@ def learn_and_apply(self, table):
def _learn_and_apply(self, table):

experiment, score_columns = self._setup_experiment(table)
final_classifier = self._learn(experiment)
final_classifier = self._learn(experiment, score_columns)

return self._build_result(table, final_classifier, score_columns, experiment)

def _learn(self, experiment):
def _learn(self, experiment, score_columns):
if self.test: # for reliable results
experiment.df.sort_values("tg_id", ascending=True, inplace=True)

Expand All @@ -285,7 +285,7 @@ def _learn(self, experiment):

if self.threads == 1:
for k in range(neval):
(ttt_scores, ttd_scores, w) = learner.learn_randomized(experiment)
(ttt_scores, ttd_scores, w) = learner.learn_randomized(experiment, score_columns, 1)
ttt.append(ttt_scores)
ttd.append(ttd_scores)
ws.append(w)
Expand All @@ -295,7 +295,12 @@ def _learn(self, experiment):
remaining = max(0, neval - self.threads)
todo = neval - remaining
neval -= todo
args = ((learner, "learn_randomized", (experiment, )), ) * todo
# args = ((learner, "learn_randomized", (experiment, score_columns, )), ) * todo
# Add individual worker ids
args = []
for thread_num in range(1, todo+1):
args.append((learner, "learn_randomized", (experiment, score_columns, thread_num, )))
args = tuple(args)
res = pool.map(unwrap_self_for_multiprocessing, args)
ttt_scores = [r[0] for r in res]
ttd_scores = [r[1] for r in res]
Expand Down Expand Up @@ -347,10 +352,10 @@ def _build_result(self, table, final_classifier, score_columns, experiment):


@profile
def PyProphet(classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette):
def PyProphet(classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette, main_score_selection_report, outfile, level):
if classifier == "LDA":
return HolyGostQuery(StandardSemiSupervisedLearner(LDALearner(), xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, test), classifier, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette)
return HolyGostQuery(StandardSemiSupervisedLearner(LDALearner(), xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, test, main_score_selection_report, outfile, level), classifier, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette)
elif classifier == "XGBoost":
return HolyGostQuery(StandardSemiSupervisedLearner(XGBLearner(xgb_hyperparams, xgb_params, xgb_params_space, threads), xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, test), classifier, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette)
return HolyGostQuery(StandardSemiSupervisedLearner(XGBLearner(xgb_hyperparams, xgb_params, xgb_params_space, threads), xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, test, main_score_selection_report, outfile, level), classifier, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette)
else:
raise click.ClickException("Classifier not supported.")
70 changes: 67 additions & 3 deletions pyprophet/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,18 @@
except ImportError:
plt = None

from PyPDF2 import PdfFileMerger, PdfFileReader

import os

import click
from scipy.stats import gaussian_kde
from numpy import linspace, concatenate, around

def color_blind_friendly(color_palette):

color_dict = {"normal":["#F5793A", "#0F2080"], "protan":["#AE9C45", "#052955"], "deutran":["#C59434", "#092C48"], "tritan":["#1ECBE1", "#E1341E"]}

if color_palette not in color_dict:
click.echo(f"WARN: {color_palette} is not a valid color_palette, must be one of 'normal'. 'protan', 'deutran', or 'tritan'. Using default 'normal'.")
color_palette = "normal"
Expand Down Expand Up @@ -142,8 +146,68 @@ def plot_scores(df, out, color_palette="normal"):
def plot_hist(x, title, xlabel, ylabel, pdf_path="histogram_plot.png"):
    """
    Save a 20-bin density histogram of ``x`` to ``pdf_path``.

    Does nothing when matplotlib could not be imported (``plt`` is None).

    Parameters
    ----------
    x : values accepted by ``plt.hist``
    title, xlabel, ylabel : str
        Figure title (wrapped) and axis labels.
    pdf_path : str
        Output file path handed to ``plt.savefig``.
    """
    if plt is None:
        return

    # Clear figures left over from earlier plotting calls
    plt.close('all')
    plt.hist(x, bins=20, density=True)
    plt.title(title, wrap=True)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Write the figure once; a second identical savefig only repeats the I/O
    plt.savefig(pdf_path)

    # Clear figures
    plt.close('all')

def main_score_selection_report(title, sel_column, mapper, decoy_scores, target_scores, target_pvalues, pi0, color_palette="normal", pdf_path="main_score_selection_report.pdf", worker_num=1):
    """
    Plot one main score selection report page and add it to a master PDF.

    The page has two panels: a histogram of target and decoy scores, and
    (when ``target_pvalues`` is given) a density histogram of the target
    p-values with the pi0 estimate drawn as a horizontal line. If
    ``pdf_path`` already exists the page is appended to it, otherwise the
    page becomes the new master PDF.

    Parameters
    ----------
    title : str
        Prefix of the figure title.
    sel_column : hashable
        Key looked up in ``mapper`` for the figure title.
    mapper : mapping
        Maps ``sel_column`` to the name shown in the figure title.
    decoy_scores, target_scores : array-like
        Scores plotted in the left panel.
    target_pvalues : array-like or None
        P-values for the right panel; the panel is left empty when None.
    pi0 : mapping or None
        When not None, ``pi0['pi0']`` is drawn and reported in the title.
    color_palette : str
        Palette name passed to ``color_blind_friendly``.
    pdf_path : str
        Path of the master report PDF.
    worker_num : int
        Used only to build the temporary PDF file name.

    Raises
    ------
    ImportError
        If matplotlib is not available.
    """
    if plt is None:
        raise ImportError("Error: The matplotlib package is required to create a report.")

    # Generate colors
    t_col, d_col = color_blind_friendly(color_palette)

    # Clear figures
    plt.close('all')

    plt.figure(figsize=(10, 11))
    plt.subplots_adjust(hspace=.5)

    # Plot Score Distribution
    plt.subplot(121)
    plt.hist([target_scores, decoy_scores], 20, color=[t_col, d_col], label=['target', 'decoy'], histtype='bar')
    plt.title("histogram of scores")
    plt.xlabel("score")
    plt.ylabel("density histogram")
    plt.legend(loc=1)

    # Plot P-value Distribution
    plt.subplot(122)
    if target_pvalues is not None:
        plt.hist(target_pvalues, bins=20, density=True)
        if pi0 is not None:
            plt.plot([0, 1], [pi0['pi0'], pi0['pi0']], "r")
            # Raw strings keep the LaTeX backslash without an invalid-escape warning
            plt.title(r"p-value density histogram: $\pi_0$ = " + str(around(pi0['pi0'], decimals=3)))
        else:
            plt.title(r"p-value density histogram: $\pi_0$ estimation failed.")
        plt.xlabel("target p-values")
        plt.ylabel("density histogram")

    # Finalize figure
    plt.suptitle(f"{title}: {mapper[sel_column]}")
    plt.tight_layout()

    # Append to existing master report if exists, otherwise write to a new master report pdf.
    # NOTE(review): no locking is visible here; if several workers share the
    # same pdf_path, confirm they cannot interleave this read-merge-write.
    if os.path.isfile(pdf_path):
        temp_pdf_path = f"temp_main_score_selection_report_thread__{worker_num}.pdf"
        # Save current plot in temporary pdf
        plt.savefig(temp_pdf_path)
        # Create output to merge pdfs
        output = PdfFileMerger()
        try:
            # Keep both inputs open until the merged file is written, then
            # close them so the temporary file can be removed on every platform.
            with open(pdf_path, "rb") as master_pdf, open(temp_pdf_path, "rb") as page_pdf:
                output.append(PdfFileReader(master_pdf))
                output.append(PdfFileReader(page_pdf))
                # Write to master pdf
                output.write(pdf_path)
        finally:
            output.close()
            # Remove temporary pdf, also when merging failed
            os.remove(temp_pdf_path)
    else:
        # Save as master pdf
        plt.savefig(pdf_path)

    # Clear figures
    plt.close('all')
5 changes: 3 additions & 2 deletions pyprophet/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class PyProphetRunner(object):
"""Base class for workflow of command line tool
"""

def __init__(self, infile, outfile, classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette):
def __init__(self, infile, outfile, classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette, main_score_selection_report):
def read_tsv(infile):
table = pd.read_csv(infile, "\t")
return(table)
Expand Down Expand Up @@ -232,6 +232,7 @@ def read_osw(infile, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_m
self.test = test
self.ss_score_filter = ss_score_filter
self.color_palette = color_palette
self.main_score_selection_report = main_score_selection_report

self.prefix = os.path.splitext(outfile)[0]

Expand Down Expand Up @@ -414,7 +415,7 @@ def save_bin_weights(self, weights, extra_writes):
class PyProphetLearner(PyProphetRunner):

def run_algo(self):
(result, scorer, weights) = PyProphet(self.classifier, self.xgb_hyperparams, self.xgb_params, self.xgb_params_space, self.xeval_fraction, self.xeval_num_iter, self.ss_initial_fdr, self.ss_iteration_fdr, self.ss_num_iter, self.group_id, self.parametric, self.pfdr, self.pi0_lambda, self.pi0_method, self.pi0_smooth_df, self.pi0_smooth_log_pi0, self.lfdr_truncate, self.lfdr_monotone, self.lfdr_transformation, self.lfdr_adj, self.lfdr_eps, self.tric_chromprob, self.threads, self.test, self.ss_score_filter, self.color_palette).learn_and_apply(self.table)
(result, scorer, weights) = PyProphet(self.classifier, self.xgb_hyperparams, self.xgb_params, self.xgb_params_space, self.xeval_fraction, self.xeval_num_iter, self.ss_initial_fdr, self.ss_iteration_fdr, self.ss_num_iter, self.group_id, self.parametric, self.pfdr, self.pi0_lambda, self.pi0_method, self.pi0_smooth_df, self.pi0_smooth_log_pi0, self.lfdr_truncate, self.lfdr_monotone, self.lfdr_transformation, self.lfdr_adj, self.lfdr_eps, self.tric_chromprob, self.threads, self.test, self.ss_score_filter, self.color_palette, self.main_score_selection_report, self.outfile, self.level).learn_and_apply(self.table)
return (result, scorer, weights)

def extra_writes(self):
Expand Down

0 comments on commit 265fca4

Please sign in to comment.