From fb709c0a9d7008e593a356a8313a53ab0315ed12 Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Fri, 30 May 2025 15:48:46 -0700 Subject: [PATCH 1/3] added proteomics return --- scripts/prepare_data_for_improve.py | 77 ++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/scripts/prepare_data_for_improve.py b/scripts/prepare_data_for_improve.py index fb1ed692..336b2012 100644 --- a/scripts/prepare_data_for_improve.py +++ b/scripts/prepare_data_for_improve.py @@ -326,7 +326,82 @@ def process_datasets(args): ) ) + #------------------------------------------------------------------- + # create proteomics master table + #------------------------------------------------------------------- + + proteomics = merge_master_tables( + args=args, + data_sets=data_sets, + data_type='proteomics' + ) + + #### + # Imputation step: + # currently we are imputing by generating the median over all samples + # in which the protein was detected across all datasets. + # The missing values are then back filled for each protein. 
+ #### + proteomics = ( + proteomics + # the proteomics table has to be transposed first (see below) + # due to .fillna not working as expected with axis==1 + .T + .fillna( + # the filling of NAs with 'value' is not implemented for + # axis==1, despite what is documented for pandas>2.0.0 + value=proteomics.median(axis=1, skipna=True), + axis=0 + ) + .T # transpose back into original orientation + ) + # merging Ensembl gene id & gene symbol into the proteomics + # data + proteomics = pd.merge( + proteomics, + data_gene_names[[ + 'entrez_id', + 'ensembl_gene_id', + 'gene_symbol' + ]], + how='left', + on='entrez_id', + ) + + # moving ensembl_gene_id & gene_symbol columns to the front of the table + # such that when transposing the DataFrame they are row 3 and 2 + # respectively + proteomics.insert( + 1, + 'gene_symbol', + proteomics.pop('gene_symbol') + ) + proteomics.insert( + 0, + 'ensembl_gene_id', + proteomics.pop('ensembl_gene_id') + ) + + proteomics = proteomics[proteomics['entrez_id'] != 0] + proteomics = proteomics.fillna(0).T.reset_index() + for i in range(0,3): + proteomics.iloc[i,0] = np.nan + # writing the proteomics datatable to '/x_data/*_proteomics.tsv' + outfile_path = args.WORKDIR.joinpath( + "data_out", + "x_data", + "cancer_proteomics.tsv" + ) + (proteomics + .to_csv( + path_or_buf=outfile_path, + sep='\t', + header=False, + index=False + ) + ) + #------------------------------------------------------------------- # create copynumber master table & discretized table #------------------------------------------------------------------- @@ -869,7 +944,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'): for data_set in data_sets: if data_sets[data_set].experiments is not None: if ( - data_type in ['transcriptomics', 'copy_number'] and + data_type in ['transcriptomics', 'copy_number', 'proteomics'] and getattr(data_sets[data_set], data_type, None) is not None ): dfs_to_merge.append( From c0e3c0396963b4df23ea934b3dad09b8050f9503 
Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Thu, 14 Aug 2025 11:40:11 -0700 Subject: [PATCH 2/3] added basic balancing logic - currently 4 evenly spaced bins --- scripts/prepare_data_for_improve.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/scripts/prepare_data_for_improve.py b/scripts/prepare_data_for_improve.py index 336b2012..ad43d981 100644 --- a/scripts/prepare_data_for_improve.py +++ b/scripts/prepare_data_for_improve.py @@ -82,6 +82,13 @@ def main(): type=int, default=10 ) + p_process_datasets.add_argument( + '-b', '--balance_by', dest='BALANCE_BY', + choices=['auc', 'fit_auc'], + default=None, + help="Defines if and using which drug response metric the splits " + "should be balanced by." + ) p_process_datasets.add_argument( '-r', '--random_seeds', dest='RANDOM_SEEDS', type=_random_seed_list, @@ -166,7 +173,7 @@ def process_datasets(args): logger.debug("creating list of datasets that contain experiment info ...") for data_set in data_sets_names_list: # sarcpdo has different drug response values - if data_set == 'sarcpdo': + if data_set == 'sarcpdo' and data_sets[data_set].experiments is not None: experiment = data_sets[data_set].format( data_type='experiments', shape='wide', @@ -763,13 +770,21 @@ def split_data_sets( args: dict, data_sets: dict, data_sets_names: list, - response_data: pd.DataFrame + response_data: pd.DataFrame, ): splits_folder = args.WORKDIR.joinpath('data_out', 'splits') split_type = args.SPLIT_TYPE ratio = (8,1,1) - stratify_by = None + stratify_by = args.BALANCE_BY + if stratify_by is not None: + balance = True + quantiles = False + num_classes = 4 + else: + balance = False + quantiles = True + num_classes = 4 if args.RANDOM_SEEDS is not None: random_seeds = args.RANDOM_SEEDS else: @@ -818,6 +833,9 @@ def split_data_sets( split_type=split_type, ratio=ratio, stratify_by=stratify_by, + balance=balance, + quantiles=quantiles, + num_classes=num_classes, random_state=random_seeds[i] ) 
train_keys = ( From aae916095d578f692fd78b42f42e5afee5159ffb Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Mon, 18 Aug 2025 16:00:42 -0700 Subject: [PATCH 3/3] added logic to convert from mRE(S)CIST to auc --- scripts/prepare_data_for_improve.py | 39 ++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/scripts/prepare_data_for_improve.py b/scripts/prepare_data_for_improve.py index ad43d981..553aecfe 100644 --- a/scripts/prepare_data_for_improve.py +++ b/scripts/prepare_data_for_improve.py @@ -172,8 +172,16 @@ def process_datasets(args): experiments = [] logger.debug("creating list of datasets that contain experiment info ...") for data_set in data_sets_names_list: - # sarcpdo has different drug response values - if data_set == 'sarcpdo' and data_sets[data_set].experiments is not None: + experiments_raw = data_sets[data_set].experiments + + # Some datasets don't have drug response data (the experiments + # table) + if experiments_raw is None: + logger.debug(f"NO experiment data for {data_set}") + + + # Logic for datasets containing "published_auc" but not "auc" + elif experiments_raw['dose_response_metric'].isin(['published_auc']).any(): experiment = data_sets[data_set].format( data_type='experiments', shape='wide', @@ -183,8 +191,29 @@ def process_datasets(args): ) experiment.rename(columns={'published_auc': 'auc'}, inplace=True) experiments.append(experiment) - # not all Datasets have experiments / drug response data - elif data_sets[data_set].experiments is not None: + + # Logic for PDX datasets that don't have `auc` but mRECIST (note + # the typo currently in the `dose_response_metric` column). 
+ elif experiments_raw['dose_response_metric'].isin(['mRESCIST']).any(): + experiment = data_sets[data_set].format( + data_type='experiments', + shape='wide', + metrics=[ + 'mRESCIST', + ], + ) + # conversion logic from mRECIST -> auc + experiment.loc[experiment['mRESCIST'] == 'CR', 'mRESCIST'] = 0.1 + experiment.loc[experiment['mRESCIST'] == 'PR', 'mRESCIST'] = 0.2 + experiment.loc[experiment['mRESCIST'] == 'SD', 'mRESCIST'] = 0.5 + experiment.loc[experiment['mRESCIST'] == 'PD', 'mRESCIST'] = 1.0 + + experiment.rename(columns={'mRESCIST': 'auc'}, inplace=True) + experiments.append(experiment) + + # The remaining datasets should have `auc` as + # drug_response_metric available in the `experiments` table + else: logger.debug(f"experiment data found for {data_set}") # formatting existing response data to wide experiment = data_sets[data_set].format( @@ -203,8 +232,6 @@ def process_datasets(args): ], ) experiments.append(experiment.dropna()) - else: - logger.debug(f"NO experiment data for {data_set}") # concatenating existing response data and "clean up" logger.debug("concatenating experiment data ...")