
Feature/dynamic main score #110

Merged: 16 commits, Dec 15, 2022
3 changes: 3 additions & 0 deletions Dockerfile
@@ -10,3 +10,6 @@ WORKDIR /pyprophet
RUN python setup.py install
WORKDIR /
RUN rm -rf /pyprophet

# Set final working directory, useful for when binding to a local mount
WORKDIR /data/
2 changes: 1 addition & 1 deletion pyprophet/classifiers.py
@@ -108,7 +108,7 @@ def objective(params):
'scale_pos_weight': "{:.3f}".format(params['scale_pos_weight']),
}

clf = xgb.XGBClassifier(random_state=42, silent=1, objective='binary:logitraw', eval_metric='auc', **params)
clf = xgb.XGBClassifier(random_state=42, verbosity=0, objective='binary:logitraw', eval_metric='auc', **params)

score = cross_val_score(clf, X, y, scoring='roc_auc', n_jobs=self.threads, cv=KFold(n_splits=3, shuffle=True, random_state=np.random.RandomState(42))).mean()
# click.echo("Info: AUC: {:.3f} hyperparameters: {}".format(score, params))
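For context on this change: newer XGBoost releases replaced the `silent` parameter with `verbosity`, so passing `silent=1` raises warnings or errors. A minimal sketch of the same construction, using synthetic data and illustrative hyperparameters rather than PyProphet's own:

```python
# Minimal sketch: XGBClassifier with verbosity=0 (replacing the removed silent=1)
# evaluated with a 3-fold cross-validated ROC AUC, mirroring the pattern above.
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, cross_val_score

X, y = make_classification(n_samples=500, n_features=10, random_state=42)

clf = xgb.XGBClassifier(
    random_state=42,
    verbosity=0,                   # replaces the removed 'silent=1' flag
    objective="binary:logitraw",
    eval_metric="auc",
)

cv = KFold(n_splits=3, shuffle=True, random_state=42)
score = cross_val_score(clf, X, y, scoring="roc_auc", cv=cv).mean()
print(f"mean AUC: {score:.3f}")
```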
27 changes: 26 additions & 1 deletion pyprophet/data_handling.py
@@ -246,7 +246,32 @@ def prepare_data_table(table,
df = cleanup_and_check(df)
return df, all_score_columns


def update_chosen_main_score_in_table(train, score_columns, use_as_main_score):
"""
Update the feature table's main score column
"""
# Get current main score column name
old_main_score_column = [col for col in score_columns if 'main' in col][0]
# Get the table's aliased score column names
df_column_score_alias = [col for col in train.df.columns if col not in ['tg_id', 'tg_num_id', 'is_decoy', 'is_top_peak', 'is_train', 'classifier_score']]
# Generate mapping to rename columns in table
mapper = {alias_col : col for alias_col, col in zip(df_column_score_alias, score_columns)}
# Rename columns with actual feature score names
train.df.rename(columns=mapper, inplace=True)
# Update columns to set the new main score column based on the most important feature column
updated_score_columns = [col.replace("main_", "") if col==old_main_score_column else col for col in score_columns]
updated_score_columns = [col.replace("var", "main_var") if col==use_as_main_score else col for col in updated_score_columns]
updated_score_columns = sorted(updated_score_columns, key=lambda x:(x!=use_as_main_score.replace("var", "main_var"), x))
updated_score_columns = [old_main_score_column if old_main_score_column.replace("main_", "")==col else col for col in updated_score_columns]
# Rename columns with feature aliases
mapper = {v : 'var_{0}'.format(i) for i, v in enumerate(updated_score_columns[1:len(updated_score_columns)])}
mapper[updated_score_columns[0].replace("main_", "")] = 'main_score'
train.df.rename(columns=mapper, inplace=True)
# Re-order main_score column index
temp_col = train.df.pop('main_score')
train.df.insert(5, temp_col.name, temp_col)
click.echo(f"Info: Updated main score column from {old_main_score_column} to {use_as_main_score}...")
return train, tuple(updated_score_columns)
class Experiment(object):

@profile
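To make the renaming round trip in `update_chosen_main_score_in_table` easier to follow: the training frame stores scores under aliases (`main_score`, `var_0`, `var_1`, ...) while `score_columns` keeps the real file column names; the helper maps aliases back to real names, promotes the chosen feature to the main score, then maps back to aliases so `main_score` now refers to that feature. A toy walk-through with made-up column names (not PyProphet's schema):

```python
# Toy illustration of the alias round trip; column names are invented.
import pandas as pd

score_columns = ("main_var_xcorr", "var_library_corr", "var_ms1_score")
use_as_main_score = "var_ms1_score"                           # feature to promote
old_main = [c for c in score_columns if "main" in c][0]       # 'main_var_xcorr'

df = pd.DataFrame({"tg_id": [0, 1],
                   "main_score": [1.2, 0.8],   # alias of main_var_xcorr
                   "var_0": [0.5, 0.4],        # alias of var_library_corr
                   "var_1": [2.0, 1.5]})       # alias of var_ms1_score

# 1. aliases -> real column names
alias_cols = [c for c in df.columns if c != "tg_id"]
df = df.rename(columns=dict(zip(alias_cols, score_columns)))

# 2. rebuild the score-column order with the promoted feature first;
#    the old main score keeps its stored 'main_var_...' file name
updated = [c.replace("main_", "") if c == old_main else c for c in score_columns]
updated = [c.replace("var", "main_var") if c == use_as_main_score else c for c in updated]
updated = sorted(updated, key=lambda c: (c != use_as_main_score.replace("var", "main_var"), c))
updated = [old_main if old_main.replace("main_", "") == c else c for c in updated]

# 3. real column names -> aliases again; the promoted feature becomes 'main_score'
mapper = {c: f"var_{i}" for i, c in enumerate(updated[1:])}
mapper[updated[0].replace("main_", "")] = "main_score"
df = df.rename(columns=mapper)

print(updated)              # ['main_var_ms1_score', 'var_library_corr', 'main_var_xcorr']
print(df.columns.tolist())  # ['tg_id', 'var_1', 'var_0', 'main_score']
```

The real function additionally pops `main_score` and re-inserts it at a fixed column index so downstream code finds it in the expected position.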
10 changes: 6 additions & 4 deletions pyprophet/main.py
@@ -73,10 +73,11 @@ def cli():
@click.option('--tric_chromprob/--no-tric_chromprob', default=False, show_default=True, help='Whether chromatogram probabilities for TRIC should be computed.')
# Visualization
@click.option('--color_palette', default='normal', show_default=True, type=click.Choice(['normal', 'protan', 'deutran', 'tritan']), help='Color palette to use in reports.')
@click.option('--main_score_selection_report/--no-main_score_selection_report', default=False, show_default=True, help='Generate a report for main score selection process.')
# Processing
@click.option('--threads', default=1, show_default=True, type=int, help='Number of threads used for semi-supervised learning. -1 means all available CPUs.', callback=transform_threads)
@click.option('--test/--no-test', default=False, show_default=True, help='Run in test mode with fixed seed.')
def score(infile, outfile, classifier, xgb_autotune, apply_weights, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette):
def score(infile, outfile, classifier, xgb_autotune, apply_weights, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette, main_score_selection_report):
"""
Conduct semi-supervised learning and error-rate estimation for MS1, MS2 and transition-level data.
"""
@@ -89,13 +90,14 @@ def score(infile, outfile, classifier, xgb_autotune, apply_weights, xeval_fracti
# Prepare XGBoost-specific parameters
xgb_hyperparams = {'autotune': xgb_autotune, 'autotune_num_rounds': 10, 'num_boost_round': 100, 'early_stopping_rounds': 10, 'test_size': 0.33}

xgb_params = {'eta': 0.3, 'gamma': 0, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': 1, 'alpha': 0, 'scale_pos_weight': 1, 'silent': 1, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}
xgb_params = {'eta': 0.3, 'gamma': 0, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': 1, 'alpha': 0, 'scale_pos_weight': 1, 'verbosity': 0, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}

xgb_params_space = {'eta': hp.uniform('eta', 0.0, 0.3), 'gamma': hp.uniform('gamma', 0.0, 0.5), 'max_depth': hp.quniform('max_depth', 2, 8, 1), 'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1), 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': hp.uniform('lambda', 0.0, 1.0), 'alpha': hp.uniform('alpha', 0.0, 1.0), 'scale_pos_weight': 1.0, 'silent': 1, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}
xgb_params_space = {'eta': hp.uniform('eta', 0.0, 0.3), 'gamma': hp.uniform('gamma', 0.0, 0.5), 'max_depth': hp.quniform('max_depth', 2, 8, 1), 'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1), 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': hp.uniform('lambda', 0.0, 1.0), 'alpha': hp.uniform('alpha', 0.0, 1.0), 'scale_pos_weight': 1.0, 'verbosity': 0, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}

if not apply_weights:
PyProphetLearner(infile, outfile, classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette).run()
PyProphetLearner(infile, outfile, classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette, main_score_selection_report).run()
else:

PyProphetWeightApplier(infile, outfile, classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, apply_weights, ss_score_filter, color_palette).run()


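The new `--main_score_selection_report/--no-main_score_selection_report` option is click's paired on/off flag, which arrives in the command function as a plain boolean. A minimal sketch of the pattern, with an illustrative command name rather than the real `score` entry point:

```python
# Sketch of click's paired boolean flag, as used for the new report option.
import click

@click.command()
@click.option('--main_score_selection_report/--no-main_score_selection_report',
              default=False, show_default=True,
              help='Generate a report for the main score selection process.')
def demo(main_score_selection_report):
    click.echo(f"report enabled: {main_score_selection_report}")

if __name__ == '__main__':
    demo()
```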
19 changes: 12 additions & 7 deletions pyprophet/pyprophet.py
@@ -264,11 +264,11 @@ def learn_and_apply(self, table):
def _learn_and_apply(self, table):

experiment, score_columns = self._setup_experiment(table)
final_classifier = self._learn(experiment)
final_classifier = self._learn(experiment, score_columns)

return self._build_result(table, final_classifier, score_columns, experiment)

def _learn(self, experiment):
def _learn(self, experiment, score_columns):
if self.test: # for reliable results
experiment.df.sort_values("tg_id", ascending=True, inplace=True)

@@ -285,7 +285,7 @@ def _learn(self, experiment):

if self.threads == 1:
for k in range(neval):
(ttt_scores, ttd_scores, w) = learner.learn_randomized(experiment)
(ttt_scores, ttd_scores, w) = learner.learn_randomized(experiment, score_columns, 1)
ttt.append(ttt_scores)
ttd.append(ttd_scores)
ws.append(w)
@@ -295,7 +295,12 @@ def _learn(self, experiment):
remaining = max(0, neval - self.threads)
todo = neval - remaining
neval -= todo
args = ((learner, "learn_randomized", (experiment, )), ) * todo
# args = ((learner, "learn_randomized", (experiment, score_columns, )), ) * todo
# Add individual worker ids
args = []
for thread_num in range(1, todo+1):
args.append((learner, "learn_randomized", (experiment, score_columns, thread_num, )))
args = tuple(args)
res = pool.map(unwrap_self_for_multiprocessing, args)
ttt_scores = [r[0] for r in res]
ttd_scores = [r[1] for r in res]
@@ -347,10 +352,10 @@ def _build_result(self, table, final_classifier, score_columns, experiment):


@profile
def PyProphet(classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette):
def PyProphet(classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette, main_score_selection_report, outfile, level):
if classifier == "LDA":
return HolyGostQuery(StandardSemiSupervisedLearner(LDALearner(), xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, test), classifier, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette)
return HolyGostQuery(StandardSemiSupervisedLearner(LDALearner(), xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, test, main_score_selection_report, outfile, level), classifier, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette)
elif classifier == "XGBoost":
return HolyGostQuery(StandardSemiSupervisedLearner(XGBLearner(xgb_hyperparams, xgb_params, xgb_params_space, threads), xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, test), classifier, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette)
return HolyGostQuery(StandardSemiSupervisedLearner(XGBLearner(xgb_hyperparams, xgb_params, xgb_params_space, threads), xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, test, main_score_selection_report, outfile, level), classifier, ss_num_iter, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, tric_chromprob, threads, test, ss_score_filter, color_palette)
else:
raise click.ClickException("Classifier not supported.")
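The multiprocessing change above gives each parallel `learn_randomized` call its own worker id (so, for example, each worker can write its own temporary report file) by building one argument tuple per worker instead of repeating a single tuple. A sketch of that dispatch pattern with placeholder names, not PyProphet's actual learner or unwrap helper:

```python
# Sketch of per-worker dispatch: each tuple carries the object, the method name,
# and that worker's arguments (including a unique worker id).
import multiprocessing

def unwrap_for_multiprocessing(arg):
    obj, method_name, args = arg
    return getattr(obj, method_name)(*args)

class ToyLearner:
    def learn_randomized(self, experiment, score_columns, worker_num):
        return f"worker {worker_num} trained on {len(score_columns)} scores"

if __name__ == "__main__":
    learner = ToyLearner()
    experiment, score_columns = None, ("main_var_xcorr", "var_ms1_score")
    todo = 4
    args = tuple((learner, "learn_randomized", (experiment, score_columns, n))
                 for n in range(1, todo + 1))
    with multiprocessing.Pool(processes=todo) as pool:
        print(pool.map(unwrap_for_multiprocessing, args))
```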
70 changes: 67 additions & 3 deletions pyprophet/report.py
@@ -6,14 +6,18 @@
except ImportError:
plt = None

from PyPDF2 import PdfFileMerger, PdfFileReader

import os

import click
from scipy.stats import gaussian_kde
from numpy import linspace, concatenate, around

def color_blind_friendly(color_palette):

color_dict = {"normal":["#F5793A", "#0F2080"], "protan":["#AE9C45", "#052955"], "deutran":["#C59434", "#092C48"], "tritan":["#1ECBE1", "#E1341E"]}

if color_palette not in color_dict:
click.echo(f"WARN: {color_palette} is not a valid color_palette, must be one of 'normal'. 'protan', 'deutran', or 'tritan'. Using default 'normal'.")
color_palette = "normal"
@@ -142,8 +146,68 @@ def plot_scores(df, out, color_palette="normal"):
def plot_hist(x, title, xlabel, ylabel, pdf_path="histogram_plot.png"):

if plt is not None:
# Clear figures
plt.close('all')
counts, __, __ = plt.hist(x, bins=20, density=True)
plt.title(title, wrap=True)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.savefig(pdf_path)

# Clear figures
plt.close('all')

def main_score_selection_report(title, sel_column, mapper, decoy_scores, target_scores, target_pvalues, pi0, color_palette="normal", pdf_path="main_score_selection_report.pdf", worker_num=1):

if plt is None:
raise ImportError("Error: The matplotlib package is required to create a report.")

# Create output merger to combine PDFs
output = PdfFileMerger()
# Generate colors
t_col, d_col = color_blind_friendly(color_palette)

# Clear figures
plt.close('all')

plt.figure(figsize=(10, 11))
plt.subplots_adjust(hspace=.5)

# Plot Score Distribution
plt.subplot(121)
plt.hist([target_scores, decoy_scores], 20, color=[t_col, d_col], label=['target', 'decoy'], histtype='bar')
plt.title(f"histogram of scores")
plt.xlabel("score")
plt.ylabel("density histogram")
plt.legend(loc=1)
# Plot P-value Distribution
plt.subplot(122)
if target_pvalues is not None:
counts, __, __ = plt.hist(target_pvalues, bins=20, density=True)
if pi0 is not None:
plt.plot([0, 1], [pi0['pi0'], pi0['pi0']], "r")
plt.title("p-value density histogram: $\pi_0$ = " + str(around(pi0['pi0'], decimals=3)))
else:
plt.title("p-value density histogram: $\pi_0$ estimation failed.")
plt.xlabel("target p-values")
plt.ylabel("density histogram")
# Finalize figure
plt.suptitle(f"{title}: {mapper[sel_column]}")
plt.tight_layout()
# Append to existing master report if exists, otherwise write to a new master report pdf.
if os.path.isfile(pdf_path):
temp_pdf_path = f"temp_main_score_selection_report_thread__{worker_num}.pdf"
# Save current plot in temporary pdf
plt.savefig(temp_pdf_path)
# Append master pdf and temp pdf to output merger
output.append(PdfFileReader(open(pdf_path, "rb")))
output.append(PdfFileReader(open(temp_pdf_path, "rb")))
# Write to master pdf
output.write(pdf_path)
# Remove temporary pdf
os.remove(temp_pdf_path)
else:
# Save as master pdf
plt.savefig(pdf_path)
# Clear figures
plt.close('all')
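The report helper appends each iteration's figure to a rolling master PDF with PyPDF2. Note that `PdfFileMerger` and `PdfFileReader` are the legacy PyPDF2 names (PyPDF2 3.x and pypdf call them `PdfMerger` and `PdfReader`). A sketch of the append-or-create pattern with a placeholder plot; unlike the diff, it writes the merged result to a temporary file and swaps it in, which avoids overwriting the master PDF while it is still being read:

```python
# Sketch of the append-or-create report pattern, assuming the legacy PyPDF2 API
# imported by this module. The plot content is a stand-in for the real panels.
import os
import matplotlib.pyplot as plt
from PyPDF2 import PdfFileMerger, PdfFileReader

def append_page(pdf_path, worker_num=1):
    plt.figure()
    plt.plot([0, 1], [0, 1])              # stand-in for the score/p-value panels
    if os.path.isfile(pdf_path):
        temp_path = f"temp_report_thread_{worker_num}.pdf"
        plt.savefig(temp_path)             # save the new page on its own
        merger = PdfFileMerger()
        with open(pdf_path, "rb") as master, open(temp_path, "rb") as page:
            merger.append(PdfFileReader(master))
            merger.append(PdfFileReader(page))
            merger.write(pdf_path + ".tmp")  # write merged copy, then swap in
        os.replace(pdf_path + ".tmp", pdf_path)
        os.remove(temp_path)
    else:
        plt.savefig(pdf_path)              # first page becomes the master report
    plt.close('all')
```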
5 changes: 3 additions & 2 deletions pyprophet/runner.py
@@ -28,7 +28,7 @@ class PyProphetRunner(object):
"""Base class for workflow of command line tool
"""

def __init__(self, infile, outfile, classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette):
def __init__(self, infile, outfile, classifier, xgb_hyperparams, xgb_params, xgb_params_space, xeval_fraction, xeval_num_iter, ss_initial_fdr, ss_iteration_fdr, ss_num_iter, ss_main_score, group_id, parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_max_transition_isotope_overlap, ipf_min_transition_sn, tric_chromprob, threads, test, ss_score_filter, color_palette, main_score_selection_report):
def read_tsv(infile):
table = pd.read_csv(infile, "\t")
return(table)
@@ -232,6 +232,7 @@ def read_osw(infile, level, ipf_max_peakgroup_rank, ipf_max_peakgroup_pep, ipf_m
self.test = test
self.ss_score_filter = ss_score_filter
self.color_palette = color_palette
self.main_score_selection_report = main_score_selection_report

self.prefix = os.path.splitext(outfile)[0]

@@ -414,7 +415,7 @@ def save_bin_weights(self, weights, extra_writes):
class PyProphetLearner(PyProphetRunner):

def run_algo(self):
(result, scorer, weights) = PyProphet(self.classifier, self.xgb_hyperparams, self.xgb_params, self.xgb_params_space, self.xeval_fraction, self.xeval_num_iter, self.ss_initial_fdr, self.ss_iteration_fdr, self.ss_num_iter, self.group_id, self.parametric, self.pfdr, self.pi0_lambda, self.pi0_method, self.pi0_smooth_df, self.pi0_smooth_log_pi0, self.lfdr_truncate, self.lfdr_monotone, self.lfdr_transformation, self.lfdr_adj, self.lfdr_eps, self.tric_chromprob, self.threads, self.test, self.ss_score_filter, self.color_palette).learn_and_apply(self.table)
(result, scorer, weights) = PyProphet(self.classifier, self.xgb_hyperparams, self.xgb_params, self.xgb_params_space, self.xeval_fraction, self.xeval_num_iter, self.ss_initial_fdr, self.ss_iteration_fdr, self.ss_num_iter, self.group_id, self.parametric, self.pfdr, self.pi0_lambda, self.pi0_method, self.pi0_smooth_df, self.pi0_smooth_log_pi0, self.lfdr_truncate, self.lfdr_monotone, self.lfdr_transformation, self.lfdr_adj, self.lfdr_eps, self.tric_chromprob, self.threads, self.test, self.ss_score_filter, self.color_palette, self.main_score_selection_report, self.outfile, self.level).learn_and_apply(self.table)
return (result, scorer, weights)

def extra_writes(self):