Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 129 additions & 9 deletions scripts/prepare_data_for_improve.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ def main():
type=int,
default=10
)
p_process_datasets.add_argument(
'-b', '--balance_by', dest='BALANCE_BY',
choices=['auc', 'fit_auc'],
default=None,
help="Defines if and using which drug response metric the splits "
"should be balanced by."
)
p_process_datasets.add_argument(
'-r', '--random_seeds', dest='RANDOM_SEEDS',
type=_random_seed_list,
Expand Down Expand Up @@ -165,8 +172,16 @@ def process_datasets(args):
experiments = []
logger.debug("creating list of datasets that contain experiment info ...")
for data_set in data_sets_names_list:
# sarcpdo has different drug response values
if data_set == 'sarcpdo':
experiments_raw = data_sets[data_set].experiments

# Some datasets don't have drug response data (the experiments
# table)
if experiments_raw is None:
logger.debug(f"NO experiment data for {data_set}")


# Logic for datasets containing "published_auc" but not "auc"
elif experiments_raw['dose_response_metric'].isin(['published_auc']).any():
experiment = data_sets[data_set].format(
data_type='experiments',
shape='wide',
Expand All @@ -176,8 +191,29 @@ def process_datasets(args):
)
experiment.rename(columns={'published_auc': 'auc'}, inplace=True)
experiments.append(experiment)
# not all Datasets have experiments / drug response data
elif data_sets[data_set].experiments is not None:

# Logic for PDX datasets that don't have `auc` but mRECIST (note
# that the value is currently misspelled as `mRESCIST` in the
# `dose_response_metric` column).
elif experiments_raw['dose_response_metric'].isin(['mRESCIST']).any():
experiment = data_sets[data_set].format(
data_type='experiments',
shape='wide',
metrics=[
'mRESCIST',
],
)
# conversion logic from mRECIST -> auc
experiment.loc[experiment['mRESCIST'] == 'CR', 'mRESCIST'] = 0.1
experiment.loc[experiment['mRESCIST'] == 'PR', 'mRESCIST'] = 0.2
experiment.loc[experiment['mRESCIST'] == 'SD', 'mRESCIST'] = 0.5
experiment.loc[experiment['mRESCIST'] == 'PD', 'mRESCIST'] = 1.0

experiment.rename(columns={'mRESCIST': 'auc'}, inplace=True)
experiments.append(experiment)

# The remaining datasets should have `auc` available as a
# `dose_response_metric` in the `experiments` table
else:
logger.debug(f"experiment data found for {data_set}")
# formatting existing response data to wide
experiment = data_sets[data_set].format(
Expand All @@ -196,8 +232,6 @@ def process_datasets(args):
],
)
experiments.append(experiment.dropna())
else:
logger.debug(f"NO experiment data for {data_set}")

# concatenating existing response data and "clean up"
logger.debug("concatenating experiment data ...")
Expand Down Expand Up @@ -326,7 +360,82 @@ def process_datasets(args):
)
)

#-------------------------------------------------------------------
# create proteomics master table
#-------------------------------------------------------------------

proteomics = merge_master_tables(
args=args,
data_sets=data_sets,
data_type='proteomics'
)

####
# Imputation step:
# currently we are imputing by generating the median over all samples
# in which the protein was detected across all datasets.
# The missing values are then filled in with that median for each protein.
####
proteomics = (
proteomics
# the proteomics table has to be transposed first (see below)
# because .fillna does not work as expected with axis==1
.T
.fillna(
# the filling of NAs with 'value' is not implemented for
# axis==1, despite what is documented for pandas>2.0.0
value=proteomics.median(axis=1, skipna=True),
axis=0
)
.T # transpose back into original orientation
)
# merging Ensembl gene id & gene symbol into the proteomics
# data
proteomics = pd.merge(
proteomics,
data_gene_names[[
'entrez_id',
'ensembl_gene_id',
'gene_symbol'
]],
how='left',
on='entrez_id',
)

# moving ensembl_gene_id & gene_symbol columns to the front of the table
# such that when transposing the DataFrame they are row 3 and 2
# respectively
proteomics.insert(
1,
'gene_symbol',
proteomics.pop('gene_symbol')
)
proteomics.insert(
0,
'ensembl_gene_id',
proteomics.pop('ensembl_gene_id')
)

proteomics = proteomics[proteomics['entrez_id'] != 0]
proteomics = proteomics.fillna(0).T.reset_index()
for i in range(0,3):
proteomics.iloc[i,0] = np.nan

# writing the proteomics datatable to '/x_data/*_proteomics.tsv'
outfile_path = args.WORKDIR.joinpath(
"data_out",
"x_data",
"cancer_proteomics.tsv"
)
(proteomics
.to_csv(
path_or_buf=outfile_path,
sep='\t',
header=False,
index=False
)
)

#-------------------------------------------------------------------
# create copynumber master table & discretized table
#-------------------------------------------------------------------
Expand Down Expand Up @@ -688,13 +797,21 @@ def split_data_sets(
args: dict,
data_sets: dict,
data_sets_names: list,
response_data: pd.DataFrame
response_data: pd.DataFrame,
):

splits_folder = args.WORKDIR.joinpath('data_out', 'splits')
split_type = args.SPLIT_TYPE
ratio = (8,1,1)
stratify_by = None
stratify_by = args.BALANCE_BY
if stratify_by is not None:
balance = True
quantiles = False
num_classes = 4
else:
balance = False
quantiles = True
num_classes = 4
if args.RANDOM_SEEDS is not None:
random_seeds = args.RANDOM_SEEDS
else:
Expand Down Expand Up @@ -743,6 +860,9 @@ def split_data_sets(
split_type=split_type,
ratio=ratio,
stratify_by=stratify_by,
balance=balance,
quantiles=quantiles,
num_classes=num_classes,
random_state=random_seeds[i]
)
train_keys = (
Expand Down Expand Up @@ -869,7 +989,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
for data_set in data_sets:
if data_sets[data_set].experiments is not None:
if (
data_type in ['transcriptomics', 'copy_number'] and
data_type in ['transcriptomics', 'copy_number', 'proteomics'] and
getattr(data_sets[data_set], data_type, None) is not None
):
dfs_to_merge.append(
Expand Down