diff --git a/build/beatAML/GetBeatAML.py b/build/beatAML/GetBeatAML.py index 5afdff8e..c963db93 100755 --- a/build/beatAML/GetBeatAML.py +++ b/build/beatAML/GetBeatAML.py @@ -9,25 +9,25 @@ import argparse import time -def download_from_github(raw_url, save_path): - """ - Download a file from a raw GitHub URL and save to the specified path. - - Parameters - ---------- - raw_url : str - The raw GitHub URL pointing to the file to be downloaded. - save_path : str - The local path where the downloaded file will be saved. - - Returns - ------- - None - """ - response = requests.get(raw_url) - with open(save_path, 'wb') as f: - f.write(response.content) - return +# def download_from_github(raw_url, save_path): +# """ +# Download a file from a raw GitHub URL and save to the specified path. + +# Parameters +# ---------- +# raw_url : str +# The raw GitHub URL pointing to the file to be downloaded. +# save_path : str +# The local path where the downloaded file will be saved. + +# Returns +# ------- +# None +# """ +# response = requests.get(raw_url) +# with open(save_path, 'wb') as f: +# f.write(response.content) +# return def retrieve_figshare_data(url): """ @@ -178,14 +178,14 @@ def retrieve_drug_info(compound_name): properties = data["PropertyTable"]["Properties"][0] pubchem_id = properties.get('CID',np.nan) canSMILES = properties.get("CanonicalSMILES", np.nan) - isoSMILES = properties.get("IsomericSMILES", np.nan) + # isoSMILES = properties.get("IsomericSMILES", np.nan) InChIKey = properties.get("InChIKey", np.nan) formula = properties.get("MolecularFormula", np.nan) weight = properties.get("MolecularWeight", np.nan) - return pubchem_id, canSMILES, isoSMILES, InChIKey, formula, weight + return pubchem_id, canSMILES, InChIKey, formula, weight else: - return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan + return np.nan, np.nan, np.nan, np.nan, np.nan def update_dataframe_with_pubchem(d_df): @@ -230,14 +230,14 @@ def update_dataframe_with_pubchem(d_df): if row['chem_name'] in data_dict and not all(pd.isna(val) for val in data_dict[row['chem_name']]): values = data_dict[row['chem_name']] else: - values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)) + values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan)) d_df.at[idx, 'pubchem_id'] = values[0] d_df.at[idx, "canSMILES"] = values[1] - d_df.at[idx, "isoSMILES"] = values[2] - d_df.at[idx, "InChIKey"] = values[3] - d_df.at[idx, "formula"] = values[4] - d_df.at[idx, "weight"] = values[5] + # d_df.at[idx, "isoSMILES"] = values[2] + d_df.at[idx, "InChIKey"] = values[2] + d_df.at[idx, "formula"] = values[3] + d_df.at[idx, "weight"] = values[4] return d_df @@ -250,24 +250,24 @@ def merge_drug_info(d_df,drug_map): d_df : pd.DataFrame Main drug dataframe containing drug-related columns. drug_map : pd.DataFrame - Mapping dataframe containing drug information and the column 'isoSMILES'. + Mapping dataframe containing drug information and the column 'canSMILES'. Returns ------- pd.DataFrame The merged dataframe containing combined drug information. 
""" - print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype) - d_df['isoSMILES'] = d_df['isoSMILES'].astype(str) - drug_map['isoSMILES'] = drug_map['isoSMILES'].astype(str) - result_df = d_df.merge(drug_map[['isoSMILES', 'improve_drug_id']], on='isoSMILES', how='left') + # print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype) + d_df['canSMILES'] = d_df['canSMILES'].astype(str) + drug_map['canSMILES'] = drug_map['canSMILES'].astype(str) + result_df = d_df.merge(drug_map[['canSMILES', 'improve_drug_id']], on='canSMILES', how='left') return result_df def format_drug_map(drug_map_path): """ Format and clean up the drug mapping file. - Reads a drug map file, removes duplicates based on the 'isoSMILES' column, + Reads a drug map file, removes duplicates based on the 'canSMILES' column, and returns the cleaned dataframe. Parameters @@ -282,11 +282,11 @@ def format_drug_map(drug_map_path): """ if drug_map_path: drug_map = pd.read_csv(drug_map_path, sep = "\t") - drug_map = drug_map.drop_duplicates(subset='isoSMILES', keep='first') + drug_map = drug_map.drop_duplicates(subset='canSMILES', keep='first') else: drug_map = pd.DataFrame(columns=[ - 'improve_drug_id', 'chem_name', 'pubchem_id', 'canSMILES', - 'isoSMILES', 'InChIKey', 'formula', 'weight' + 'improve_drug_id', 'chem_name', 'pubchem_id', + 'canSMILES', 'InChIKey', 'formula', 'weight' ]) return drug_map @@ -316,7 +316,7 @@ def format_drug_df(drug_path): def add_improve_id(previous_df, new_df): """ - Add 'improve_drug_id' to the new dataframe based on unique 'isoSMILES' not present in the previous dataframe. + Add 'improve_drug_id' to the new dataframe based on unique 'canSMILES' not present in the previous dataframe. Parameters ---------- @@ -335,16 +335,16 @@ def add_improve_id(previous_df, new_df): max_id = max(id_list) if id_list else 0 else: max_id = 0 - # Identify isoSMILES in the new dataframe that don't exist in the old dataframe - unique_new_smiles = set(new_df['isoSMILES']) - set(previous_df['isoSMILES']) - # Identify rows in the new dataframe with isoSMILES that are unique and where improve_drug_id is NaN - mask = (new_df['isoSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna()) + # Identify canSMILES in the new dataframe that don't exist in the old dataframe + unique_new_smiles = set(new_df['canSMILES']) - set(previous_df['canSMILES']) + # Identify rows in the new dataframe with canSMILES that are unique and where improve_drug_id is NaN + mask = (new_df['canSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna()) id_map = {} for smiles in unique_new_smiles: max_id += 1 id_map[smiles] = f"SMI_{max_id}" - # Apply the mapping to the new dataframe for rows with unique isoSMILES and NaN improve_drug_id - new_df.loc[mask, 'improve_drug_id'] = new_df['isoSMILES'].map(id_map) + # Apply the mapping to the new dataframe for rows with unique canSMILES and NaN improve_drug_id + new_df.loc[mask, 'improve_drug_id'] = new_df['canSMILES'].map(id_map) return new_df @@ -466,8 +466,14 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N right_on='other_id', how='left') mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id')) + + print(mapped_df.to_string()) + mapped_df['improve_sample_id'] = mapped_df['improve_sample_id'].astype(int) + mapped_df['entrez_id'] = mapped_df['entrez_id'].fillna(0) + mapped_df['entrez_id'] = mapped_df['entrez_id'].astype(int) mapped_df['source'] = 'synapse' mapped_df['study'] = 'BeatAML' + mapped_df =mapped_df.drop_duplicates() 
final_dataframe = mapped_df.dropna() return final_dataframe @@ -541,7 +547,7 @@ def generate_drug_list(drug_map_path,drug_path): d_res = add_improve_id(drug_map, d_res) #Drug Data #print(d_res) - drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES","isoSMILES"]] + drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES"]] drug_res = drug_res.drop_duplicates() drug_res.to_csv("/tmp/beataml_drugs.tsv",sep="\t", index=False) @@ -587,7 +593,12 @@ def generate_drug_list(drug_map_path,drug_path): # 'syn32533104', # 'syn32529921', 'syn26642974', - 'syn26427390' + 'syn26427390', + 'syn64126458', + 'syn64126462', + 'syn64126463', + 'syn64126464', + 'syn64126468' ] print("Downloading Files from Synapse") for entity_id in entity_ids: @@ -597,13 +608,13 @@ def generate_drug_list(drug_map_path,drug_path): #gene_url = "https://figshare.com/ndownloader/files/40576109?private_link=525f7777039f4610ef47" #entrez_map_file = retrieve_figshare_data(gene_url) - additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx" + # additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx" sample_mapping_file = "beataml_waves1to4_sample_mapping.xlsx" - download_from_github(additional_mapping_url, sample_mapping_file) + # download_from_github(additional_mapping_url, sample_mapping_file) - supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx' + # supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx' supplimentary_file = '1-s2.0-S1535610822003129-mmc2.xlsx' - download_from_github(supplementary_url, supplimentary_file) + # download_from_github(supplementary_url, supplimentary_file) if args.samples: @@ -619,9 +630,9 @@ def generate_drug_list(drug_map_path,drug_path): else: print("Drug File Provided. 
Proceeding with build.") original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" - original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" - download_from_github(original_drug_url, original_drug_file) - generate_drug_list(args.drugFile, original_drug_file) ##this doesn't exist, need to add + # original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" + # download_from_github(original_drug_url, original_drug_file) + generate_drug_list(args.drugFile, original_drug_file) if args.omics: if args.genes is None or args.curSamples is None: print('Cannot process omics without sample mapping and gene mapping files') @@ -629,16 +640,16 @@ def generate_drug_list(drug_map_path,drug_path): else: improve_map_file = args.curSamples transcriptomics_file = "beataml_waves1to4_counts_dbgap.txt" #"beataml_waves1to4_norm_exp_dbgap.txt" ##this is the wrong file, these are the normalize values - transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt" - download_from_github(transcriptomics_url, transcriptomics_file) + # transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt" + # download_from_github(transcriptomics_url, transcriptomics_file) mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt" - mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt" - download_from_github(mutations_url, mutations_file) + # mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt" + # download_from_github(mutations_url, mutations_file) mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx" - mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx" - download_from_github(mutation_map_url, mutation_map_file) + # mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx" + # download_from_github(mutation_map_url, mutation_map_file) # New Transcriptomics Data print("Starting Transcriptomics Data") ##first run conversion tool @@ -680,9 +691,9 @@ def generate_drug_list(drug_map_path,drug_path): imp_samp_map = pd.read_csv(args.curSamples) imp_drug_map = pd.read_csv(args.drugFile,sep='\t') original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" - original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" + # original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" # Generate Raw Drugs File to use in Curve fitting algorithm - download_from_github(original_drug_url, original_drug_file) + # download_from_github(original_drug_url, original_drug_file) # Experiment Data updated_raw_drug_file = "beatAML_drug_raw.tsv" generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file) diff --git a/build/broad_sanger/03a-nci60Drugs.py b/build/broad_sanger/03a-nci60Drugs.py index 28720c90..529c002e 100644 --- a/build/broad_sanger/03a-nci60Drugs.py +++ b/build/broad_sanger/03a-nci60Drugs.py @@ -39,7 +39,7 @@ def main(): opts = parser.parse_args() ###primary DF - df = 
{'improve_drug_id':[],'chem_name':[],'canSMILES':[],'isoSMILES':[],\
+    df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],\
         'InChIKey':[],'formula':[],'weight':[],'pubchem_id':[]}
     print('Downloading NSC identifiers for nci60 data')
@@ -69,7 +69,8 @@ def main():
     upper=[a.upper() for a in smiles['SMILES']]
     smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
     ##reduce to smiels only in current drugs
-    ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
+    # ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
+    ssmiles = smiles
     ssmiles = ssmiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
     pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
     arr = set(pubchems['CID'])
@@ -102,7 +102,7 @@ def main():
         {
             "improve_drug_id": ["SMI_"+str(a) for a in range(max_imp+1,max_imp+1+smicount,1)],
             'canSMILES': [a for a in set(mdf['SMILES'])],
-            'isoSMILES': [a for a in set(mdf['SMILES'])],
+            # 'isoSMILES': [a for a in set(mdf['SMILES'])],
             'InChIKey': [None for a in range(smicount)],
             'formula': [None for a in range(smicount)],
             'weight': [None for a in range(smicount)]
diff --git a/build/broad_sanger/04b-nci60-updated.py b/build/broad_sanger/04b-nci60-updated.py
index b6a82001..c515f59d 100644
--- a/build/broad_sanger/04b-nci60-updated.py
+++ b/build/broad_sanger/04b-nci60-updated.py
@@ -107,10 +107,11 @@ def main():
     finaldf = pl.DataFrame(
         {
-            'source':['NCI60' for a in molar['improve_drug_id']], ##2024 build
+            'source':['NCI60_24' for a in molar['improve_drug_id']], ##2024 build
             'improve_sample_id':molar['improve_sample_id'],
             'Drug':molar['improve_drug_id'],
-            'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
+            # 'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
+            'study': "NCI60",
             'time':molar['time'],
             'time_unit':molar['time_unit'],
             'DOSE': [(10**a)*1000000 for a in molar['CONCENTRATION']], ##move from molar to uM to match pharmacoDB
diff --git a/build/broad_sanger/05a_remove_problem_drugs.py b/build/broad_sanger/05a_remove_problem_drugs.py
new file mode 100644
index 00000000..a240fc14
--- /dev/null
+++ b/build/broad_sanger/05a_remove_problem_drugs.py
@@ -0,0 +1,43 @@
+import gc
+import polars as pl
+
+
+
+def main():
+
+    # Remove Problematic Drugs before Splitting Data
+
+    # Load the datasets
+    all_drugs = pl.read_csv("broad_sanger_drugs.tsv", separator="\t")
+    all_experiments = pl.read_csv("broad_sanger_experiments.tsv", separator="\t")
+
+    # Define the brd_list with lowercase entries for case-insensitive matching
+    brd_list = [
+        'brd-k03911514',
+        'brd-k07442505',
+        'brd-k13185470',
+        'brd-k16130065',
+        'brd-k20514654',
+        'brd-k27188169',
+        'brd-k55473186',
+        'yl54',
+        'brd-k58730230',
+        'brd-k79669418',
+        'brd-k99584050']
+
+    # Identify rows in all_drugs that match brd_list entries (case insensitive)
+    removed_drugs = all_drugs.filter(pl.col("chem_name").str.to_lowercase().is_in(brd_list))
+
+    # Store the improve_drug_id IDs of removed entries
+    improve_drug_id = removed_drugs["improve_drug_id"].to_list()
+
+    # Remove these rows from all_drugs and all_experiments
+    all_drugs = all_drugs.filter(~pl.col("improve_drug_id").is_in(improve_drug_id))
+    all_experiments = all_experiments.filter(~pl.col("improve_drug_id").is_in(improve_drug_id))
+
+    all_drugs.write_csv("broad_sanger_drugs.tsv", separator="\t")
+    all_experiments.write_csv("broad_sanger_experiments.tsv", separator="\t")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/build/broad_sanger/05_separate_datasets.py
b/build/broad_sanger/05b_separate_datasets.py similarity index 99% rename from build/broad_sanger/05_separate_datasets.py rename to build/broad_sanger/05b_separate_datasets.py index 9e99adc9..f4d5481d 100644 --- a/build/broad_sanger/05_separate_datasets.py +++ b/build/broad_sanger/05b_separate_datasets.py @@ -4,7 +4,6 @@ def main(): - datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"] omics_datatypes = ["transcriptomics","proteomics", "copy_number","mutations"] # csv samples_datatypes = ["samples"] #csv diff --git a/build/broad_sanger/build_misc.sh b/build/broad_sanger/build_misc.sh index 2fa847f1..d5896e98 100644 --- a/build/broad_sanger/build_misc.sh +++ b/build/broad_sanger/build_misc.sh @@ -4,8 +4,12 @@ set -euo pipefail trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR cp /tmp/broad_sanger* . -echo "Running 05_separate_datasets.py..." -/opt/venv/bin/python 05_separate_datasets.py + +echo "Running 05a_remove_problem_drugs.py..." +/opt/venv/bin/python 05a_remove_problem_drugs.py + +echo "Running 05b_separate_datasets.py..." +/opt/venv/bin/python 05b_separate_datasets.py echo "Removing broad_sanger* files..." rm broad_sanger* diff --git a/build/build_dataset.py b/build/build_dataset.py index 9e63030b..dd585604 100644 --- a/build/build_dataset.py +++ b/build/build_dataset.py @@ -41,6 +41,7 @@ def process_docker(dataset,validate): 'hcmi': ['hcmi'], 'beataml': ['beataml'], 'mpnst': ['mpnst'], + 'mpnstpdx': ['mpnstpdx'], 'cptac': ['cptac'], 'genes': ['genes'], 'upload': ['upload'] @@ -121,7 +122,8 @@ def process_omics(executor, dataset, should_continue): 'mpnst': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'], 'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'], 'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'], - 'hcmi': ['mutations', 'transcriptomics'] + 'hcmi': ['mutations', 'transcriptomics'], + 'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'] } expected_omics = dataset_omics_files.get(dataset, []) @@ -216,7 +218,7 @@ def run_docker_validate_cmd(cmd_arr, all_files_dir, name): Wrapper for 'docker run' command used during validation and uploads. 
''' env = os.environ.copy() - docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp"] + docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp", '--platform=linux/amd64'] docker_run.extend(['upload']) docker_run.extend(cmd_arr) print('Executing:', ' '.join(docker_run)) @@ -256,7 +258,7 @@ def run_schema_checker(dataset): decompress_file(os.path.join('local', all_files_dir, file)) # Run schema checker - schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets + schema_check_command = ['python3', 'check_schema.py', '--datasets'] + datasets run_docker_validate_cmd(schema_check_command, all_files_dir, 'Validation') def main(): diff --git a/build/cptac/getCptacData.py b/build/cptac/getCptacData.py index 527ff700..c3c401a2 100755 --- a/build/cptac/getCptacData.py +++ b/build/cptac/getCptacData.py @@ -380,11 +380,15 @@ def main(): dat_files[dtype_key] = fdf2 else: dat_files[dtype_key] = fdf.dropna() + print(dtype_key) # Now concatenate all the cancers into a single file for dtype_key, df in dat_files.items(): print('Saving ' + "cptac_" + dtype_key + '.csv.gz' + ' file') + print(df.to_string()) + df['entrez_id'] = df['entrez_id'].fillna(0) + df['entrez_id'] = df['entrez_id'].astype(int) df.to_csv("/tmp/" + "cptac_" + dtype_key + '.csv.gz', sep=',', index=False, compression='gzip') if __name__ == '__main__': diff --git a/build/docker/Dockerfile.broad_sanger_omics b/build/docker/Dockerfile.broad_sanger_omics index 2d4cbde8..43f4a428 100755 --- a/build/docker/Dockerfile.broad_sanger_omics +++ b/build/docker/Dockerfile.broad_sanger_omics @@ -34,7 +34,8 @@ ADD build/broad_sanger/build_samples.sh ./ ADD build/broad_sanger/build_omics.sh ./ ADD build/utils/* ./ ADD build/broad_sanger/build_misc.sh ./ -ADD build/broad_sanger/05_separate_datasets.py ./ +ADD build/broad_sanger/05a_remove_problem_drugs.py ./ +ADD build/broad_sanger/05b_separate_datasets.py ./ ADD build/broad_sanger/requirements.txt . ADD build/broad_sanger/omics_requirements.r . diff --git a/build/docker/Dockerfile.mpnst b/build/docker/Dockerfile.mpnst index 92fe9c6d..94e04e87 100755 --- a/build/docker/Dockerfile.mpnst +++ b/build/docker/Dockerfile.mpnst @@ -1,4 +1,4 @@ -FROM r-base:4.3.2 +FROM r-base:4.3.3 # Set environment to noninteractive ENV DEBIAN_FRONTEND=noninteractive diff --git a/build/docker/Dockerfile.mpnstPDX b/build/docker/Dockerfile.mpnstPDX new file mode 100755 index 00000000..86a3c02e --- /dev/null +++ b/build/docker/Dockerfile.mpnstPDX @@ -0,0 +1,54 @@ +FROM r-base:4.3.2 + +# Set environment to noninteractive +ENV DEBIAN_FRONTEND=noninteractive + +# Update package list and install required packages +RUN apt-get update && \ + apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \ + zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev + +# Download and compile Python 3.10 with shared library support +RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \ + tar -xf Python-3.10.12.tgz && \ + cd Python-3.10.12 && \ + ./configure --enable-optimizations --enable-shared && \ + make -j$(nproc) && \ + make altinstall && \ + cd .. 
&& \ + rm -rf Python-3.10.12.tgz Python-3.10.12 + +# Set Python 3.10 as default +RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3 && \ + ln -s /usr/local/bin/pip3.10 /usr/bin/pip3 + +# Update library paths for Python shared library +RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig + +# Create a Python virtual environment +RUN python3 -m venv /opt/venv +RUN /opt/venv/bin/pip install --upgrade pip + +# Set environment variables for reticulate +ENV RETICULATE_PYTHON="/opt/venv/bin/python3" +ENV PYTHONPATH=/app#"${PYTHONPATH}:/app" +WORKDIR /app + +# Set MPLCONFIGDIR to a writable directory +ENV MPLCONFIGDIR=/app/tmp/matplotlib +RUN mkdir -p /app/tmp/matplotlib + +# Add necessary files to the container +ADD build/mpnstpdx/requirements.txt . +ADD build/mpnstpdx/requirements.r . +ADD build/mpnstpdx/* ./ +ADD build/utils/* ./ + +# installing python libraries +RUN /opt/venv/bin/pip3 install -r requirements.txt + +# Install all R libraries from requirements.r +RUN Rscript requirements.r + +# Set up volume for temporary storage +VOLUME ["/tmp"] diff --git a/build/docker/docker-compose.yml b/build/docker/docker-compose.yml index fb9c9ab8..373eb5d6 100644 --- a/build/docker/docker-compose.yml +++ b/build/docker/docker-compose.yml @@ -44,7 +44,15 @@ services: HTTPS_PROXY: ${HTTPS_PROXY} platform: linux/amd64 image: mpnst:latest - + + mpnstpdx: + build: + context: ../../ + dockerfile: build/docker/Dockerfile.mpnstpdx + args: + HTTPS_PROXY: ${HTTPS_PROXY} + platform: linux/amd64 + image: mpnstpdx:latest cptac: build: context: ../../ diff --git a/build/hcmi/02-getHCMIData.py b/build/hcmi/02-getHCMIData.py index a7e3e74a..b329a68a 100644 --- a/build/hcmi/02-getHCMIData.py +++ b/build/hcmi/02-getHCMIData.py @@ -17,67 +17,93 @@ import polars as pl import gc import hashlib +from pathlib import Path def download_tool(url): """ - Download, extract, and make a tool (GDC Client) executable from the provided URL. + Download, extract, and prepare the GDC client tool. Parameters ---------- url : str - The URL from where the tool needs to be downloaded. + The URL to download the tool from. Returns ------- str - Name of the downloaded file. + The path to the `gdc-client` executable. """ - + # Download the file + print("Downloading tool...") filename = wget.download(url) - files_before = os.listdir() + + # First extraction + print(f"\nExtracting {filename}...") shutil.unpack_archive(filename) - files_after = os.listdir() - new_file = str(next(iter((set(files_after) - set(files_before))))) - st = os.stat(new_file) - os.chmod(new_file, st.st_mode | stat.S_IEXEC) - return filename + os.remove(filename) + + # Check for a nested zip file and extract again + extracted_files = [f for f in os.listdir() if os.path.isfile(f) and f.endswith(".zip")] + for zip_file in extracted_files: + print(f"Extracting nested archive: {zip_file}...") + shutil.unpack_archive(zip_file) + os.remove(zip_file) + + gdc_client_path = None + for root, dirs, files in os.walk("."): + if "gdc-client" in files: + gdc_client_path = os.path.join(root, "gdc-client") + break + + if not gdc_client_path: + raise FileNotFoundError("`gdc-client` executable not found after extraction.") + + # Ensure `gdc-client` is executable + print(f"Making {gdc_client_path} executable...") + st = os.stat(gdc_client_path) + os.chmod(gdc_client_path, st.st_mode | stat.S_IEXEC) + + return gdc_client_path def is_tool(name): """ - Check if a specific tool is available on the system or in the current directory. 
+ Check if a specific tool is available on the system. Parameters ---------- name : str - The name of the tool to check. + The name of the tool. Returns ------- bool True if the tool is found, otherwise False. """ - - return which(name) is not None or name in os.listdir() + return shutil.which(name) is not None or name in os.listdir() def ensure_gdc_client(): """ - Ensure that the gdc-client is available on the system. + Ensure that the GDC client tool is available on the system. - If the gdc-client tool isn't found, this function will automatically - download the appropriate version based on the operating system. + If the tool isn't found, this function downloads and prepares it. """ - tool_name = "gdc-client" if not is_tool(tool_name): - print("Downloading gdc-client") + print("GDC client not found. Downloading...") urls = { - "Darwin": 'https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_OSX_x64.zip', - "Windows": 'https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_Windows_x64.zip', - "Linux": 'https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_Ubuntu_x64.zip' + "Darwin": "https://gdc.cancer.gov/system/files/public/file/gdc-client_2.3_OSX_x64-py3.8-macos-14.zip", + "Windows": "https://gdc.cancer.gov/system/files/public/file/gdc-client_2.3_Windows_x64-py3.8-windows-2019.zip", + "Linux": "https://gdc.cancer.gov/system/files/public/file/gdc-client_2.3_Ubuntu_x64-py3.8-ubuntu-20.04.zip" } - download_tool(urls.get(platform.system())) + os_type = platform.system() + url = urls.get(os_type) + if not url: + raise ValueError(f"Unsupported OS: {os_type}") + gdc_client_path = download_tool(url) + print(f"`gdc-client` downloaded and available at {gdc_client_path}") else: - print("gdc-client already installed") + print("`gdc-client` is already installed.") + def extract_uuids_from_manifest(manifest_data): """ diff --git a/build/mpnst/01_mpnst_get_omics.R b/build/mpnst/01_mpnst_get_omics.R index 77a5466d..a1276eff 100755 --- a/build/mpnst/01_mpnst_get_omics.R +++ b/build/mpnst/01_mpnst_get_omics.R @@ -34,6 +34,7 @@ samples_df <- fread(patients)|> pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') tumor_samps<-subset(samples_df,model_type=='tumor') +mt_samps<-subset(samples_df,model_type=='organoid') ##now get the manifest from synapse manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> @@ -45,26 +46,35 @@ manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> ##they each get their own sample identifier pdx_data<-manifest|>dplyr::select(common_name,starts_with("PDX"))|> left_join(pdx_samps)|> - dplyr::select(improve_sample_id,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics') + dplyr::select(improve_sample_id,common_name,model_type,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')|> + subset(!is.na(improve_sample_id)) tumor_data<- manifest|>dplyr::select(common_name,starts_with("Tumor"))|> left_join(tumor_samps)|> - dplyr::select(improve_sample_id,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|> - mutate(Proteomics='') ##we dont have tumor proteomics from these samples + dplyr::select(improve_sample_id,common_name,model_type,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|> + mutate(Proteomics='')|> + subset(!is.na(improve_sample_id)) + ##we dont have tumor proteomics from these samples +#print(tumor_data) + +mt_data<- 
manifest|>dplyr::select(common_name,starts_with("PDX"))|> + left_join(mt_samps)|> + dplyr::select(improve_sample_id,common_name,model_type, RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')|>##we dont have mt data yet, so collecting PDX instead + subset(!is.na(improve_sample_id)) #print(tumor_data) -combined<-rbind(pdx_data,tumor_data)|>distinct() +combined<-rbind(pdx_data,tumor_data,mt_data)|>distinct() # gene mapping table genes_df <- fread(genefile) ##added proteomics first -proteomics<-do.call('rbind',lapply(setdiff(combined$Proteomics,c('',NA,"NA")),function(x){ +proteomics<-do.call('rbind',lapply(setdiff(mt_data$Proteomics,c('',NA,"NA")),function(x){ # if(x!=""){ #print(x) - sample<-subset(combined,Proteomics==x) + sample<-subset(mt_data,Proteomics==x) #print(sample) res<-fread(synGet(x)$path)|> #tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> @@ -88,10 +98,10 @@ fwrite(proteomics,'/tmp/mpnst_proteomics.csv.gz') #### FIRST WE GET RNASeq Data -rnaseq<-do.call('rbind',lapply(setdiff(combined$RNASeq,c(NA,"NA")),function(x){ +rnaseq<-do.call('rbind',lapply(setdiff(mt_data$RNASeq,c(NA,"NA")),function(x){ # if(x!=""){ #print(x) - sample<-subset(combined,RNASeq==x) + sample<-subset(mt_data,RNASeq==x) #print(sample) res<-fread(synGet(x)$path)|> tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> @@ -114,11 +124,11 @@ fwrite(rnaseq,'/tmp/mpnst_transcriptomics.csv.gz') #####NEXT WE DO WES DATA print("Getting WES") -wes<-do.call(rbind,lapply(setdiff(combined$`Mutations`,c(NA,"NA")),function(x){ +wes<-do.call(rbind,lapply(setdiff(mt_data$`Mutations`,c(NA,"NA")),function(x){ x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) print(x) - sample<-subset(combined,Mutations==x) + sample<-subset(mt_data,Mutations==x) print(sample$improve_sample_id) res<-NULL try(res<-fread(synGet(x2)$path)|> @@ -141,11 +151,11 @@ fwrite(wes,'/tmp/mpnst_mutations.csv.gz') print(paste("getting CNV")) ##next let's do CNVs! 
-cnv<-do.call(rbind,lapply(setdiff(combined$CopyNumber,c(NA,"NA")),function(x){
+cnv<-do.call(rbind,lapply(setdiff(mt_data$CopyNumber,c(NA,"NA")),function(x){
   x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T)
   print(x)
-  sample<-subset(combined,CopyNumber==x)
+  sample<-subset(mt_data,CopyNumber==x)
   print(sample$improve_sample_id)
   res<-fread(synGet(x2)$path)
diff --git a/build/mpnst/02_get_drug_data.R b/build/mpnst/02_get_drug_data.R
index 7c07af9b..e90a31fb 100644
--- a/build/mpnst/02_get_drug_data.R
+++ b/build/mpnst/02_get_drug_data.R
@@ -103,7 +103,7 @@ if (!is.na(olddrugfiles)) {
     chem_name = character(),
     pubchem_id = character(),
     canSMILES = character(),
-    isoSMILES = character(),
+    # isoSMILES = character(),
     InChIKey = character(),
     formula = character(),
     weight = numeric(),
@@ -118,7 +118,7 @@ if (!is.na(olddrugfiles)) {
     chem_name = character(),
     pubchem_id = character(),
     canSMILES = character(),
-    isoSMILES = character(),
+    # isoSMILES = character(),
     InChIKey = character(),
     formula = character(),
     weight = numeric(),
diff --git a/build/mpnst/03_get_drug_response_data.R b/build/mpnst/03_get_drug_response_data.R
index 8f02ca80..9bbb6f00 100644
--- a/build/mpnst/03_get_drug_response_data.R
+++ b/build/mpnst/03_get_drug_response_data.R
@@ -32,7 +32,7 @@ org_samps<-subset(samples_df,model_type=='organoid')
 ##now get the manifest from synapse
 manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
-  as.data.frame()|>
+  as.data.table()|>
   dplyr::rename(common_name='Sample')
diff --git a/build/mpnst/README.md b/build/mpnst/README.md
index 10d0bcd9..13c08454 100755
--- a/build/mpnst/README.md
+++ b/build/mpnst/README.md
@@ -12,7 +12,7 @@ directory. Currently using the test files as input.
    `mpnst_samples.csv` file. This pulls from the latest synapse
    project metadata table.
    ```
-   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnst sh build_samples.sh /tmp/build/build_test/test_samples.csv
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnst sh build_samples.sh /tmp/build/build_test/test_samples.csv
    ```
 3. Pull the data and map it to the samples. This uses the metadata
diff --git a/build/mpnst/build_drugs.sh b/build/mpnst/build_drugs.sh
index 3b969d2b..821bc46c 100644
--- a/build/mpnst/build_drugs.sh
+++ b/build/mpnst/build_drugs.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
-set -euo pipefail
+#set -euo pipefail
-trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+#trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
 echo "Running 02_get_drug_data.R with /tmp/mpnst_drugs.tsv and $1."
Rscript 02_get_drug_data.R /tmp/mpnst_drugs.tsv $1 diff --git a/build/mpnst/requirements.r b/build/mpnst/requirements.r index 72dd457d..7796236d 100755 --- a/build/mpnst/requirements.r +++ b/build/mpnst/requirements.r @@ -4,8 +4,8 @@ install.packages('remotes') remotes::install_version('rjson', version = '0.2.21', repos = 'https://cloud.r-project.org') install.packages('synapser', repos = c('http://ran.synapse.org', 'https://cloud.r-project.org')) install.packages("dplyr") -install.packages("data.table") install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org")) +install.packages("data.table") install.packages("R.utils") install.packages("stringr") -install.packages("tidyr") \ No newline at end of file +install.packages("tidyr") diff --git a/build/mpnstPDX/01_mpnstpdx_get_omics.R b/build/mpnstPDX/01_mpnstpdx_get_omics.R new file mode 100755 index 00000000..86e3cbb8 --- /dev/null +++ b/build/mpnstPDX/01_mpnstpdx_get_omics.R @@ -0,0 +1,195 @@ +# Load required libraries +library(data.table) +# library(biomaRt)# biomart issues still exist +library(synapser) +library(dplyr) + +# Retrieve command line arguments +args <- commandArgs(trailingOnly = TRUE) + +# Check if a token was provided +if (length(args) == 0) { + stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [genes]", call. = FALSE) +} + +# Set your personal access token +PAT <- args[1] +patients <- args[2] +genefile <- args[3] + +# Log in to Synapse +synLogin(authToken = PAT) + +# Define the Ensembl mart # biomart issues still exist +# ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") # biomart issues still exist; fix later... + +# Path to the directory to save .sf files +#path <- "./tmp" +#dir.create(path, showWarnings = FALSE) + +# Read the sample mapping CSV and genes.csv +samples_df <- fread(patients)|> + dplyr::select(improve_sample_id,common_name,model_type)|> + distinct()#"mpnst/synapse_NF-MPNSTpdx_samples.csv") + +pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') +tumor_samps<-subset(samples_df,model_type=='tumor') + +##now get the manifest from synapse +manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> + as.data.frame()|> + dplyr::rename(common_name='Sample') + + +##for now we only have tumor and pdx data +##they each get their own sample identifier +pdx_data<-manifest|>dplyr::select(common_name,starts_with("PDX"))|> + left_join(pdx_samps)|> + dplyr::select(improve_sample_id,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics') + +tumor_data<- manifest|>dplyr::select(common_name,starts_with("Tumor"))|> + left_join(tumor_samps)|> + dplyr::select(improve_sample_id,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|> + mutate(Proteomics='') ##we dont have tumor proteomics from these samples +#print(tumor_data) + + +pdx_data<-rbind(pdx_data,tumor_data)|>distinct() + +# gene mapping table +genes_df <- fread(genefile) + + +##added proteomics first +proteomics<-do.call('rbind',lapply(setdiff(pdx_data$Proteomics,c('',NA,"NA")),function(x){ + # if(x!=""){ + #print(x) + sample<-subset(pdx_data,Proteomics==x) + #print(sample) + res<-fread(synGet(x)$path)|> + #tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> + #dplyr::select(-vers)|> + dplyr::rename(gene_symbol='Gene')|> + left_join(genes_df)|> + dplyr::select(entrez_id,proteomics='logRatio')|> + distinct()|> + subset(!is.na(entrez_id))|> + subset(proteomics!=0) + + 
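+  # attach the improve_sample_id plus source/study provenance columns to every proteomics row before returning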
res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) + res$source=rep('NF Data Portal',nrow(res)) + res$study=rep('MPNST PDX',nrow(res)) + return(distinct(res)) + # } +})) + +fwrite(proteomics,'/tmp/mpnstpdx_proteomics.csv.gz') + + +#### FIRST WE GET RNASeq Data + +rnaseq<-do.call('rbind',lapply(setdiff(pdx_data$RNASeq,c(NA,"NA")),function(x){ + # if(x!=""){ + #print(x) + sample<-subset(pdx_data,RNASeq==x) + #print(sample) + res<-fread(synGet(x)$path)|> + tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> + dplyr::select(-vers)|> + left_join(genes_df)|> + dplyr::select(entrez_id,transcriptomics='TPM')|> + subset(!is.na(entrez_id))|> + subset(transcriptomics!=0) + + res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) + res$source=rep('NF Data Portal',nrow(res)) + res$study=rep('MPNST PDX',nrow(res)) + return(distinct(res)) + # } +})) + +fwrite(rnaseq,'/tmp/mpnstpdx_transcriptomics.csv.gz') + + + +#####NEXT WE DO WES DATA +print("Getting WES") +wes<-do.call(rbind,lapply(setdiff(pdx_data$`Mutations`,c(NA,"NA")),function(x){ + + x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) + print(x) + sample<-subset(pdx_data,Mutations==x) + print(sample$improve_sample_id) + res<-NULL + try(res<-fread(synGet(x2)$path)|> + dplyr::select(entrez_id='Entrez_Gene_Id',mutation='HGVSc',variant_classification='Variant_Classification')|> + subset(entrez_id%in%genes_df$entrez_id)|> + distinct()) + if(is.null(res)) + return(NULL) + + res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) + res$source=rep('NF Data Portal',nrow(res)) + res$study=rep('MPNST PDX',nrow(res)) + + return(distinct(res)) + # } +})) + +fwrite(wes,'/tmp/mpnstpdx_mutations.csv.gz') + + +print(paste("getting CNV")) +##next let's do CNVs! +cnv<-do.call(rbind,lapply(setdiff(pdx_data$CopyNumber,c(NA,"NA")),function(x){ + + x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) + print(x) + sample<-subset(pdx_data,CopyNumber==x) + print(sample$improve_sample_id) + res<-fread(synGet(x2)$path) + + long_df<- res|> + tidyr::separate_rows(gene,sep=',')|> + dplyr::rename(gene_symbol='gene')|> + dplyr::left_join(genes_df)|> + subset(!is.na(entrez_id))|> + dplyr::select(entrez_id,log2)|> + dplyr::distinct()|> + dplyr::mutate(copy_number=2^log2)|> + dplyr::select(-log2) + + res<-long_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp + dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', + ifelse(copy_number<0.7311832,'het loss', + ifelse(copy_number<1.214125,'diploid', + ifelse(copy_number<1.422233,'gain','amp')))))|> + mutate(study='MPNST PDX',source='NF Data Portal',improve_sample_id=sample$improve_sample_id[1])|> + dplyr::distinct() + + # long_df <- res[, strsplit(as.character(gene), ","), by = .(chromosome, start, end, depth, log2)] + # filtered_df <- long_df |> + # subset(is.finite(log2))|> + # filter(V1 %in% genes_df$gene) # get only protein coding genes and remove empty gene symbols + # filtered_df <- filtered_df[, .(gene_symbol = V1, + # improve_sample_id = sample$improve_sample_id[1], + # copy_number = 2^log2, + # source = "NF Data Portal", + # study = "MPNST PDX")] + # res<-filtered_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp + # dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', + # ifelse(copy_number<0.7311832,'het loss', + # ifelse(copy_number<1.214125,'diploid', + # ifelse(copy_number<1.422233,'gain','amp')))))|> + # left_join(genes_df)|> + # 
dplyr::select(entrez_id,improve_sample_id,copy_number,copy_call,study,source)|>
+  #   subset(!is.na(entrez_id))|>
+  #   distinct()
+  # res|>group_by(copy_call)|>summarize(n_distinct(entrez_id))
+  return(res)
+  # }
+}))
+
+fwrite(cnv,'/tmp/mpnstpdx_copy_number.csv.gz')
+
+##TODO: get proteomics!!!
diff --git a/build/mpnstPDX/02_get_drug_data.R b/build/mpnstPDX/02_get_drug_data.R
new file mode 100644
index 00000000..1f6ad47e
--- /dev/null
+++ b/build/mpnstPDX/02_get_drug_data.R
@@ -0,0 +1,120 @@
+# Load required libraries
+library(data.table)
+# library(biomaRt)# biomart issues still exist
+library(dplyr)
+library(stringr)
+library(reticulate)
+library(synapser)
+library(tidyr)
+
+
+# Retrieve command line arguments
+args <- commandArgs(trailingOnly = TRUE)
+
+# Check if a token was provided
+if (length(args) == 0) {
+  stop("No token or drug file provided. Usage: Rscript my_script.R [olddrugfile] [newdrugfile]", call. = FALSE)
+}
+
+# Set your personal access token
+PAT <- args[1]
+olddrugfiles <- args[2]
+newdrugfile <- args[3]
+# Log in to Synapse
+synLogin(authToken = PAT)
+
+
+##now get the manifest from synapse
+manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
+  as.data.frame()|>
+  dplyr::rename(common_name='Sample')
+
+
+##PDX contain list of files
+pdx<-manifest|>
+  dplyr::select(common_name,PDX_Drug_Data)|>
+  distinct()|>
+  subset(!is.na(PDX_Drug_Data))
+
+
+
+
+
+##define functions
+
+#print(pdx)
+##now loop through manifest to get all the files
+pdx_fold <- data.table(pdx)[,strsplit(as.character(PDX_Drug_Data),","), by = .(common_name)]|>
+  subset(!is.na(V1))|>
+  subset(V1!='NA')|>
+  dplyr::rename(id='V1')
+
+#print(pdx_fold)
+###this is not all of them
+pdx_meta<-do.call(rbind,lapply(pdx_fold$id, function(x) synapser::synGetAnnotations(x)|>
+  as.data.frame()|>
+  dplyr::select('experimentalCondition')|>
+  dplyr::mutate(id=x)))|>
+  left_join(pdx_fold)|>
+  tidyr::separate_rows(experimentalCondition,sep=';')|>
+  mutate(chem_name=tolower(experimentalCondition))
+
+#pdx_drug <- data.table(pdx_meta)[,strsplit(as.character(experimentalCondition),';'),by= .(common_name,id)]|>
+#  mutate(drug=tolower(experimentalCondition))
+#drugs<-sapply(pdx_meta$experimentalCondition,function(x) tolower(unlist(strsplit(x,split=';'))))|>
+#  unlist()|>
+#  unique()
+
+drugs<-setdiff(pdx_meta$chem_name,'control')
+
+
+print(paste(drugs,collapse=','))
+
+
+##copy old drug to new drug
+olddrugs<-do.call(rbind,lapply(unique(unlist(strsplit(olddrugfiles,split=','))),function(x) read.table(x,header=T,sep='\t',quote='',comment.char='')))
+olddrugs<-unique(olddrugs)
+
+print(paste('Read in ',nrow(olddrugs),'old drug files'))
+
+fdrugs<-subset(olddrugs,chem_name%in%drugs)
+if(nrow(fdrugs)>0){
+  dids<-fdrugs$improve_drug_id
+}else{
+  dids<-c()
+}
+newdrugs<-subset(olddrugs,improve_drug_id%in%dids)
+
+print(paste('Found',length(dids),'improved drug ids that exist, saving those'))
+
+
+  #file.copy(olddrugfile,newdrugfile)
+write.table(newdrugs,file=newdrugfile,sep='\t',row.names=F,quote=FALSE,col.names=T)
+output_file_path <- newdrugfile
+ignore_file_path <- '/tmp/mpnstpdx_ignore_chems.txt'
+
+
+##now load reticulate down here
+
+
+
+use_python("/opt/venv/bin/python3", required = TRUE)
+source_python("pubchem_retrieval.py")
+
+update_dataframe_and_write_tsv(unique_names=drugs,output_filename=output_file_path,ignore_chems=ignore_file_path)
+
+
+tab<-read.table(newdrugfile,sep='\t',header=T,quote="",comment.char="")
+
+newdrugs<-tab|>
+  subset(chem_name%in%tolower(drugs))
+
+tab<-tab|>
+ 
subset(improve_drug_id%in%newdrugs$improve_drug_id) + +write.table(tab,file=newdrugfile,sep='\t',row.names=FALSE,quote=FALSE) + + +##now call the python drug script + + diff --git a/build/mpnstPDX/03_get_drug_response_data.R b/build/mpnstPDX/03_get_drug_response_data.R new file mode 100644 index 00000000..095dba34 --- /dev/null +++ b/build/mpnstPDX/03_get_drug_response_data.R @@ -0,0 +1,174 @@ +# Load required libraries +library(data.table) +# library(biomaRt)# biomart issues still exist +library(synapser) +library(dplyr) +library(stringr) +# Retrieve command line arguments +args <- commandArgs(trailingOnly = TRUE) + +# Check if a token was provided +if (length(args) == 0) { + stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [drugs]", call. = FALSE) +} + +# Set your personal access token +PAT <- args[1] +patients <- args[2] +drugfile <- args[3] + +# Log in to Synapse +synLogin(authToken = PAT) + + +# Read the sample mapping CSV and genes.csv +samples_df <- fread(patients)|> + dplyr::select(improve_sample_id,common_name,model_type)|> + distinct()#"mpnst/synapse_NF-MPNST_samples.csv") +print(head(samples_df)) + +pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') +org_samps<-subset(samples_df,model_type=='organoid') + +##now get the manifest from synapse +manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> + as.data.frame()|> + dplyr::rename(common_name='Sample') + + +##PDX contain list of files +pdx<-manifest|> + dplyr::select(common_name,PDX_Drug_Data)|> + subset(!PDX_Drug_Data%in%c("NA",NA))|> + left_join(pdx_samps)|> + distinct() + +print(pdx) + + +# Modify the extract_date_hour function to return a named vector +extract_date_hour <- function(experiment_id) { + pattern <- "(\\d{6})_?(\\d{2,3})?" + matches <- str_match(experiment_id, pattern) + date <- matches[, 2] + hour <- matches[, 3] + date[is.na(date)] <- NA # Replace with NA instead of blank + hour[is.na(hour)] <- 48 # Replace with 48 instead of blank (default) + return(list(date = date, hour = hour)) +} + + + +##define functions + +##first function to get children from parentId + +##now loop through manifest to get all the files +#mts_fold <- data.table(mts)[,strsplit(as.character(MicroTissueDrugFolder),","), by = .(improve_sample_id,common_name)] + + + +##do the drug matching +drug_df<-fread(drugfile)|> + dplyr::select('improve_drug_id','chem_name')|> + distinct() + +##update drug name PD901 since it's mussing +##now loop through manifest to get all the files +pdx_fold <- data.table(pdx)[,strsplit(as.character(PDX_Drug_Data),","), by = .(common_name)]|> + dplyr::rename(id='V1')|> + subset(!is.na(id)) + +pdx_meta<-do.call(rbind,lapply(pdx_fold$id, function(x) synapser::synGetAnnotations(x)|> + as.data.frame()|> + dplyr::select('experimentalCondition')|> + dplyr::mutate(id=x)))|>left_join(pdx_fold)|> + # tidyr::separate_rows(experimentalCondition,sep=';')|> + # mutate(chem_name=tolower(experimentalCondition))|> + # left_join(drug_df)|> + left_join(pdx_samps)|> + dplyr::select(improve_sample_id,id)|> + distinct()|> + subset(!is.na(id)) +pdx_meta$parentId=unlist(lapply(pdx_meta$id,function(x) synGet(x)$parentId)) + +##the older pdx data is in separate files. the newer is not. 
+#we need to reformat the older to look like the newer +oldfolders=c('syn22018363','syn22024460','syn22024428','syn22024429','syn22024437','syn22024438') + +old_meta<-subset(pdx_meta,parentId%in%oldfolders) + +old_data<-do.call(rbind,lapply(unique(old_meta$parentId),function(x){ + ids<-subset(old_meta,parentId==x)|> + subset(!is.na(id)) + + do.call(rbind,lapply(ids$id,function(y){ + tab<-readr::read_csv(synapser::synGet(y)$path) + print(head(tab)) + tab<-dplyr::select(tab,c('specimen_id','compound_name','dose','dose_unit', + 'experimental_time_point','experimental_time_point_unit', + 'assay_type','assay_value','assay_units'))|> + mutate(id=x)|> + mutate(chem_name=tolower(compound_name)) + + # tab$single_or_combo=sapply(tab$chem_name,function(z) ifelse(length(grep('\\+',z))>0,'combo','single')) + tab$chem_name=gsub('n/a','control',tab$chem_name)|> + tidyr::replace_na('control') + + tab$chem_name=sapply(tab$chem_name,function(z) ifelse(z=='doxorubinsin','doxorubicin',z)) + # tab<-tab|>left_join(drug_df) + #print(head(tab)) + return(tab) + })) +}))|> + left_join(unique(select(old_meta,id=parentId,improve_sample_id)))|> + dplyr::select(experiment=id,model_id=improve_sample_id,specimen_id,treatment=chem_name,time=experimental_time_point,time_unit=experimental_time_point_unit,volume=assay_value)|>distinct() + + + +new_meta<-subset(pdx_meta,!parentId%in%oldfolders) + +##now combine each of the old pdx files into single files +#each file has all experiments in it +new_data<-do.call(rbind,lapply(unique(new_meta$id), function(x){ + fpath=synapser::synGet(x)$path + if(length(grep('xls',fpath))>0){ + tab<-readxl::read_xlsx(fpath) + }else{ + tab<-readr::read_csv(fpath) + } + print(head(tab)) + tab<-dplyr::select(tab,c('specimen_id','compound_name','dose','dose_unit', + 'experimental_time_point','experimental_time_point_unit', + 'assay_type','assay_value','assay_units'))|> + mutate(id=x) + + # tab$single_or_combo=sapply(tab$compound_name,function(x) ifelse(length(grep('\\+',x))>0,'combo','single')) + tab$compound_name=gsub('N/A','control',tab$compound_name)|>tidyr::replace_na('control') + tab<-tab|> + mutate(chem_name=tolower(compound_name))#|> + # left_join(drug_df) + #print(head(tab)) + return(tab)}))|> + left_join(pdx_meta)|> + dplyr::select(experiment=id,model_id=improve_sample_id,specimen_id,treatment=chem_name,time=experimental_time_point,time_unit=experimental_time_point_unit,volume=assay_value)|>distinct() + +##maybe tweak the data frame a bit depending on curve fitting script +pdx_data<-rbind(old_data,new_data) + +#single_pdx<-subset(pdx_data,single_or_combo=='single') +#combo_pdx<-subset(pdx_data,single_or_combo=='combo') +#print(head(pdx_data)) +fwrite(pdx_data,'/tmp/curve_data.tsv',sep='\t') + +##TODO: create new curve fitting script in python +pycmd = '/opt/venv/bin/python calc_pdx_metrics.py --input /tmp/curve_data.tsv --outprefix /tmp/mpnstpdx' +print('running curve fitting') +#system(pycmd) + +##now read in data again, separate out by single/combo, then map to drug id + +##mmve file name +#file.rename('/tmp/experiments.0','/tmp/mpnstpdx_experiments.tsv') + + diff --git a/build/mpnstPDX/README.md b/build/mpnstPDX/README.md new file mode 100755 index 00000000..b0059283 --- /dev/null +++ b/build/mpnstPDX/README.md @@ -0,0 +1,47 @@ +## Build Instructions for MPNST PDX Dataset + +To build the MPNST PDX dataset, follow these steps from the coderdata root +directory. Currently using the test files as input. + +1. 
Build the Docker image:
+   ```
+   docker build -f build/docker/Dockerfile.mpnstpdx -t mpnstpdx . --build-arg HTTPS_PROXY=$HTTPS_PROXY
+   ```
+
+2. Generate new identifiers for these samples to create a
+   `mpnstpdx_samples.csv` file. This pulls from the latest synapse
+   project metadata table.
+   ```
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_samples.sh /tmp/build/build_test/test_samples.csv
+   ```
+
+3. Pull the data and map it to the samples. This uses the metadata
+   table pulled above.
+   ```
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_omics.sh /tmp/build/build_test/test_genes.csv /tmp/mpnstpdx_samples.csv
+   ```
+
+4. Process drug data.
+   ```
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_drugs.sh /tmp/build/build_test/test_drugs.tsv
+   ```
+
+5. Process experiment data. This uses the metadata from above as well as the file directory on synapse:
+   ```
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_exp.sh /tmp/mpnstpdx_samples.csv /tmp/mpnstpdx_drugs.tsv
+   ```
+
+Please ensure that each step is followed in order for correct dataset compilation.
+
+## MPNST PDX Dataset Structure
+The MPNST PDX dataset includes the following output files:
+```
+├── mpnstpdx_samples.csv
+├── mpnstpdx_transcriptomics.csv.gz
+├── mpnstpdx_mutations.csv.gz
+├── mpnstpdx_copy_number.csv.gz
+├── mpnstpdx_drugs.tsv
+├── mpnstpdx_drug_descriptors.tsv.gz
+├── mpnstpdx_experiments.tsv.gz
+```
 
diff --git a/build/mpnstPDX/build_drugs.sh b/build/mpnstPDX/build_drugs.sh
new file mode 100644
index 00000000..78502bc7
--- /dev/null
+++ b/build/mpnstPDX/build_drugs.sh
@@ -0,0 +1,4 @@
+##get drug data
+Rscript 02_get_drug_data.R $SYNAPSE_AUTH_TOKEN $1 /tmp/mpnstpdx_drugs.tsv
+##get drug descriptors
+/opt/venv/bin/python3 build_drug_desc.py --drugtable /tmp/mpnstpdx_drugs.tsv --desctable /tmp/mpnstpdx_drug_descriptors.tsv.gz
\ No newline at end of file
diff --git a/build/mpnstPDX/build_exp.sh b/build/mpnstPDX/build_exp.sh
new file mode 100644
index 00000000..4e34f6b3
--- /dev/null
+++ b/build/mpnstPDX/build_exp.sh
@@ -0,0 +1,2 @@
+Rscript 03_get_drug_response_data.R $SYNAPSE_AUTH_TOKEN $1 $2
+/opt/venv/bin/python3 calc_pdx_metrics.py /tmp/curve_data.tsv --drugfile=/tmp/mpnstpdx_drugs.tsv --outprefix=/tmp/mpnstpdx
diff --git a/build/mpnstPDX/build_omics.sh b/build/mpnstPDX/build_omics.sh
new file mode 100644
index 00000000..969b4fba
--- /dev/null
+++ b/build/mpnstPDX/build_omics.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 01_mpnstpdx_get_omics.R with $SYNAPSE_AUTH_TOKEN, $2, and $1."
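+# 01_mpnstpdx_get_omics.R reads its arguments as <synapse token> <samples csv> <genes csv>, so $2 (samples) and $1 (genes) are passed in that order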
+Rscript 01_mpnstpdx_get_omics.R $SYNAPSE_AUTH_TOKEN $2 $1 diff --git a/build/mpnstPDX/build_samples.sh b/build/mpnstPDX/build_samples.sh new file mode 100644 index 00000000..aa88aa02 --- /dev/null +++ b/build/mpnstPDX/build_samples.sh @@ -0,0 +1 @@ +cp /tmp/mpnst_samples.csv /tmp/mpnstpdx_samples.csv diff --git a/build/mpnstPDX/requirements.r b/build/mpnstPDX/requirements.r new file mode 100755 index 00000000..e6139cd4 --- /dev/null +++ b/build/mpnstPDX/requirements.r @@ -0,0 +1,13 @@ +install.packages('reticulate', repos='https://cloud.r-project.org') +reticulate::use_virtualenv('/opt/venv', required = TRUE) +install.packages('remotes') +remotes::install_version('rjson', version = '0.2.21', repos = 'https://cloud.r-project.org') +install.packages('synapser', repos = c('http://ran.synapse.org', 'https://cloud.r-project.org')) +install.packages("dplyr") +install.packages("data.table") +install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org")) +install.packages("R.utils") +install.packages("stringr") +install.packages("tidyr") +install.packages('readr') +install.packages("readxl") diff --git a/build/mpnstPDX/requirements.txt b/build/mpnstPDX/requirements.txt new file mode 100755 index 00000000..b0944928 --- /dev/null +++ b/build/mpnstPDX/requirements.txt @@ -0,0 +1,12 @@ +pyarrow +pandas +matplotlib +numpy==1.26.4 +argparse +tqdm +scikit-learn +scipy +requests +mordredcommunity +rdkit +statsmodels diff --git a/build/utils/build_drug_desc.py b/build/utils/build_drug_desc.py index 7e25212e..f119bc77 100644 --- a/build/utils/build_drug_desc.py +++ b/build/utils/build_drug_desc.py @@ -82,8 +82,9 @@ def main(): cansmiles = [a for a in set(tab.canSMILES) if str(a)!='nan'] # isosmiles = list(set(tab.isoSMILES)) morgs = smiles_to_fingerprint(cansmiles) - +# print(morgs) ids = pd.DataFrame(tab[['improve_drug_id','canSMILES']]).drop_duplicates() +# print(ids) id_morg = ids.rename({"canSMILES":'smile'},axis=1).merge(morgs)[['improve_drug_id','structural_descriptor','descriptor_value']] mords = smiles_to_mordred(cansmiles,nproc=ncors) diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py new file mode 100755 index 00000000..c1a32d26 --- /dev/null +++ b/build/utils/calc_pdx_metrics.py @@ -0,0 +1,400 @@ +''' +python script designed to compute pdx curve metrics from file +''' + + +from argparse import * +import pandas as pd +from tqdm import tqdm +import numpy as np + +import statsmodels.api as sm +from statsmodels.formula.api import mixedlm + + +####mRECIST code +def tumor_volume_change(volume): + """ + Calculate the percent change in tumor volume relative to the initial volume. + + Parameters: + volume (np.ndarray or list): Array of tumor volumes. + + Returns: + np.ndarray: Array of percent changes in tumor volume. + """ + Vini = volume[0] + return np.array([100 * (vt - Vini) / Vini for vt in volume]) + +def avg_response(volume_change): + """ + Calculate the average response for each time point. + + Parameters: + volume_change (np.ndarray or list): Array of volume percent changes. + + Returns: + np.ndarray: Array of average responses. + """ + cumsum = np.cumsum(volume_change) + ar = cumsum / np.arange(1, len(volume_change) + 1) + return ar + +def check_numeric_int_char_zero(u): + """ + Check if the input array is empty and return NA if true. + + Parameters: + u (any): Input value or array. + + Returns: + The input value or None if it is empty. 
+ """ + return None if len(u) == 0 else u + +def get_best_response(time, response, min_time=None): + """ + Determine the best response after a specified minimum time. + + Parameters: + time (np.ndarray or list): Array of time points. + response (np.ndarray or list): Array of response values. + min_time (float or None): Minimum time after which tumor volume will be considered. + + Returns: + dict: Dictionary containing the best response time, value, and index. + """ + rtz = {"time": None, "value": None, "index": None} + + exdf = pd.DataFrame({"time": time, "response": response}) + exdf = exdf.dropna() + exdf.reset_index() + if min_time is not None: + exdf_a = exdf[exdf['time'] >= min_time] + + if exdf.shape[0] == 0: + return rtz + exdf_a.reset_index() + # print(exdf.shape) + min_indx = list(exdf.response).index(min(exdf_a.response)) + + rtz['time'] = check_numeric_int_char_zero([list(exdf.time)[min_indx]]) + rtz['value'] = check_numeric_int_char_zero([list(exdf.response)[min_indx]]) + rtz['index'] = check_numeric_int_char_zero([min_indx]) + return rtz + +def mrecist(time, volume, min_time=10, return_detail=False): + """ + Compute the mRECIST for given volume response. + + Parameters: + time (np.ndarray or list): Array of time points. + volume (np.ndarray or list): Array of tumor volumes. + min_time (float): Minimum time after which tumor volume will be considered. + return_detail (bool): If True, return all intermediate values. + + Returns: + str or dict: mRECIST classification or a dictionary with detailed intermediate values. + """ + if volume[0] == 0: + volume = np.array(volume) + 1 + + exdf = { + "volume_change": None, + "average_response": None, + "best_response": None, + "best_response)time": None, + "best_average_response": None, + "best_average_response_time": None, + "mRECIST": None + } + + exdf["volume_change"] = tumor_volume_change(volume) + exdf["average_response"] = avg_response(exdf["volume_change"]) + nxdf = {'metric':'mRESCIST','value':None} + + df = pd.DataFrame({"time": time, "volume": volume}) + df = df[df['time'] >= min_time] + + if df.shape[0] < 2: + print(f"Warning: insufficient data after time {min_time}") + else: + br = get_best_response(time, exdf["volume_change"], min_time) + exdf["best_response"] = br["value"] + exdf["best_response_time"] = br["time"] + + bar = get_best_response(time, exdf["average_response"], min_time) + exdf["best_average_response"] = bar["value"] + exdf["best_average_response_time"] = bar["time"] + + best_response = exdf["best_response"][0] + best_average_response = exdf["best_average_response"][0] + + mrecist = None + nxdf = {'metric':'mRESCIST','value':mrecist} +# exdf["mRECIST"] = mrecist + + if best_response is not None and best_average_response is not None: + # Order of mRECIST assignment matters + mrecist = "PD" + + if best_response < 35 and best_average_response < 30: + mrecist = "SD" + + if best_response < -50 and best_average_response < -20: + mrecist = "PR" + + if best_response < -95 and best_average_response < -40: + mrecist = "CR" + +# exdf["mRECIST"] = mrecist + nxdf = {'metric':'mRESCIST','value':mrecist} + if not return_detail: + return nxdf + + return nxdf + +###AUC CODE + +def trapz_auc(x, y): + """ + Compute the area under the curve using trapezoidal rule. + + Parameters: + x (np.ndarray): Array of x values. + y (np.ndarray): Array of y values. + + Returns: + float: Area under the curve. 
+ """ + n = np.arange(1, len(x)) + auc = np.dot((x[n] - x[n - 1]), (y[n] + y[n - 1])) / 2 + return auc + +def AUC(time, volume, time_normalize=True): + """ + Calculate the area under the curve (AUC). + + Parameters: + time (np.ndarray): Array of time points recorded for the experiment. + volume (np.ndarray): Array of volumes corresponding to the time points. + time_normalize (bool): If True, AUC value will be divided by max time. + + Returns: + dict: Dictionary containing the AUC value. + """ + auc = trapz_auc(time, volume) + #print(time) + if time_normalize: + auc = auc/np.max(time) + return {"metric": "auc", "value": auc, 'time':np.max(time)} + +def TGI(contr_volume, treat_volume,time): + """ + Computes the tumor growth inhibition (TGI) between two time-volume curves. + + Parameters: + contr_volume (np.ndarray or list): Volume vector for control. + treat_volume (np.ndarray or list): Volume vector for treatment. + + Returns: + dict: Dictionary containing the TGI value. + """ + tgi = None + if len(contr_volume) > 0 and len(treat_volume) > 0: + tgi = (contr_volume[-1] - treat_volume[-1]) / (contr_volume[0] - treat_volume[0]) + + # Simulated batch response class object + rtx = { + "metric": "TGI", + "value": tgi, + 'time': np.max(time) + } + return rtx + + +def ABC(contr_time=None, contr_volume=None, treat_time=None, treat_volume=None): + """ + Compute the area between two time-volume curves. + + Parameters: + contr_time (np.ndarray): Array of time points for control. + contr_volume (np.ndarray): Array of control volumes. + treat_time (np.ndarray): Array of time points for treatment. + treat_volume (np.ndarray): Array of treatment volumes. + + Returns: + dict: Dictionary containing the area between curves. + """ + con = {'name': 'auc', 'value': None} + tre = {'name': 'auc', 'value': None} + abc = None + + if contr_time is not None and contr_volume is not None: + if len(contr_time) != len(contr_volume): + raise ValueError("contr.time and contr.volume should have same length") + con = AUC(contr_time, contr_volume) + + if treat_time is not None and treat_volume is not None: + if len(treat_time) != len(treat_volume): + raise ValueError("treat.time and treat.volume should have same length") + tre = AUC(treat_time, treat_volume) + + abc = con['value'] - tre['value'] + return {"metric": "abc", "value": abc,'time':np.max(treat_time)}#, "control": con, "treatment": tre} + + +###LMM CODE +def lmm(time, volume, treatment, drug_name): + """ + Compute the linear mixed model (lmm) statistics for a PDX batch. + + Parameters: + data (pd.DataFrame): A DataFrame containing the batch data. Must contain the columns: model_id, volume, time, exp_type. + + Returns: + dict: A dictionary containing the fit object and the specific coefficient value. 
+ """ + + data = pd.DataFrame({'model_id':['model']*len(time),\ + 'volume':volume,\ + 'time':time,\ + 'exp_type':treatment}) + + data = data.dropna() + + ##create data frame from these 4 vectors + required_columns = ["model_id", "volume", "time", "exp_type"] + + if not all(column in data.columns for column in required_columns): + raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'") + + data['log_volume'] = np.log(data['volume']) + + # Define the formula for mixed linear model + formula = 'log_volume ~ time*exp_type' + + # Fit the model + model = mixedlm(formula, data, groups=data['model_id']) + fit = model.fit() + + # Get the coefficient for the interaction term 'time:exp_type' + #interaction_term = 'time:exp_type' +# if interaction_term in fit.params: +# time_coef_value = fit.params['time'] + #print(fit.params) + i_coef_value = fit.params['time:exp_type[T.'+drug_name+']'] + # else: + # coef_value = None # Handle the case when the interaction term is not present + + # Create a dictionary to store the fit object and the specific coefficient value + result = { + 'fit': fit, + 'interaction_coef_value': i_coef_value + } + + return {'metric': 'lmm','value': i_coef_value,'time':np.max(time)} + +def main(): + parser=ArgumentParser() + ###read in file with model id, volume, time, condition + parser.add_argument('curvefile') + parser.add_argument('--drugfile') + parser.add_argument('--outprefix',default='/tmp/') + + args = parser.parse_args() + + tab = pd.read_csv(args.curvefile,sep='\t') + drugs = pd.read_csv(args.drugfile,sep='\t') + + singles, combos = get_drug_stats(tab) + + ##join with drug ids + expsing = singles.rename({'drug':'chem_name','metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).merge(drugs,on='chem_name',how='left')[['improve_drug_id','improve_sample_id','drug_combination_metric','drug_combination_value']] + expsing = expsing.dropna() + + combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True) + combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() + + expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','drug_combination_metric','drug_combination_value']] + expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','drug_combination_metric','drug_combination_value']] + + expcomb[['source']]='Synapse' + expcomb[['study']]='MPNST PDX in vivo' + + expsing[['source']]='Synapse' + expsing[['study']]='MPNST PDX in vivo' + expsing.to_csv(args.outprefix+'_experiments.csv',index=False) + expcomb.to_csv(args.outprefix+'_combinations.csv',index=False) + + + +def get_drug_stats(df,control='control'): + ##for each experiment, call group + cols = ['experiment','model_id'] + groups = df.groupby(cols) + singleres = [] + combores = [] + + for name,group in tqdm(groups): + #each group contains multiple treatments anda control + drugs = set(group.treatment)-set([control]) + print(name[0]) + print(drugs) + mod = list(set(group.model_id))[0] + # print(set(group.model_id)) + ctl_data = group[group.treatment==control] + ctl_time = np.array(ctl_data.time) + ctl_volume = np.array(ctl_data.volume) + + ctl_auc = AUC(ctl_time,ctl_volume) + for d in 
+def get_drug_stats(df, control='control'):
+    ## for each experiment/model group, compute metrics for every drug vs. the control arm
+    cols = ['experiment','model_id']
+    groups = df.groupby(cols)
+    singleres = []
+    combores = []
+
+    for name,group in tqdm(groups):
+        # each group contains multiple treatments and a control
+        drugs = set(group.treatment)-set([control])
+        print(name[0])
+        print(drugs)
+        mod = list(set(group.model_id))[0]
+        # print(set(group.model_id))
+        ctl_data = group[group.treatment==control]
+        ctl_time = np.array(ctl_data.time)
+        ctl_volume = np.array(ctl_data.volume)
+
+        ctl_auc = AUC(ctl_time,ctl_volume)
+        for d in drugs:
+            print(d)
+            d_data = group[group.treatment==d]
+            treat_time = np.array(d_data.time)
+            treat_volume = np.array(d_data.volume)
+
+            # get abc for group
+            treat_auc = AUC(treat_time,treat_volume)
+            treat_abc = ABC(ctl_time,ctl_volume,treat_time,treat_volume)
+            #print(f"AUC: {treat_auc}")
+            #print(f"ABC: {treat_abc}")
+            treat_abc.update({'sample':mod,'drug':d,'time_unit':'days'})
+            if '+' in d:
+                combores.append(treat_abc)
+            else:
+                singleres.append(treat_abc)
+            # lmm
+            comb = pd.concat([ctl_data,d_data])
+            lmm_res = lmm(comb.time, comb.volume, comb.treatment, d)
+            lmm_res.update({'sample':mod,'drug':d,'time_unit':'days'})
+            #print(f"LMM: {lmm_res}")
+            if '+' in d:
+                combores.append(lmm_res)
+            else:
+                singleres.append(lmm_res)
+
+            # get tgi for group
+            tg = TGI(ctl_volume,treat_volume,treat_time)
+            tg.update({'sample':mod,'drug':d,'time_unit':'days'})
+            #print(tg)
+            if '+' in d:
+                combores.append(tg)
+            else:
+                singleres.append(tg)
+
+
+            # get mRECIST for group
+            mr = mrecist(treat_time,treat_volume)
+            mr.update({'sample':mod,'drug':d,'time_unit':'days'})
+            if '+' in d:
+                combores.append(mr)
+            else:
+                singleres.append(mr)
+
+    sing = pd.DataFrame.from_records(singleres)
+    comb = pd.DataFrame.from_records(combores)
+    return sing, comb
+
+if __name__=='__main__':
+    main()
diff --git a/build/utils/pubchem_retrieval.py b/build/utils/pubchem_retrieval.py
index e6bdc836..1cf646af 100644
--- a/build/utils/pubchem_retrieval.py
+++ b/build/utils/pubchem_retrieval.py
@@ -54,12 +54,12 @@ def retrieve_drug_info(compound,ignore_chems,isname=True):
 
     if isname:
         urls = {
-            "properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound}/property/CanonicalSMILES,IsomericSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
+            "properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound}/property/CanonicalSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
             "synonyms": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound}/synonyms/JSON"
         }
     else:
         urls = {
-            "properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{compound}/property/CanonicalSMILES,IsomericSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
+            "properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{compound}/property/CanonicalSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
             "synonyms": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{compound}/synonyms/JSON"
         }
@@ -188,9 +188,9 @@ def update_dataframe_and_write_tsv(unique_names, output_filename="drugs.tsv",ign
     mode = 'a' if file_exists else 'w'
     with open(output_filename, mode) as f:
         if not file_exists:
-            f.write("improve_drug_id\tchem_name\tpubchem_id\tcanSMILES\tisoSMILES\tInChIKey\tformula\tweight\n")
+            f.write("improve_drug_id\tchem_name\tpubchem_id\tcanSMILES\tInChIKey\tformula\tweight\n")
         for entry in data:
-            f.write(f"{entry['improve_drug_id']}\t{entry['name']}\t{entry.get('CID', '')}\t{entry['CanonicalSMILES']}\t{entry.get('IsomericSMILES', '')}\t{entry['InChIKey']}\t{entry['MolecularFormula']}\t{entry['MolecularWeight']}\n")
+            f.write(f"{entry['improve_drug_id']}\t{entry['name']}\t{entry.get('CID', '')}\t{entry['CanonicalSMILES']}\t{entry['InChIKey']}\t{entry['MolecularFormula']}\t{entry['MolecularWeight']}\n")
 
     with open(ignore_chems,"a") as ig_f:
         for entry in data:
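With IsomericSMILES removed from both the PubChem property request and the TSV writer, anything that reads drugs.tsv should now expect seven columns (canSMILES but no isoSMILES). A quick check of the layout; the path is a placeholder:

    import pandas as pd

    drugs = pd.read_csv('/tmp/mpnstpdx_drugs.tsv', sep='\t')
    expected = ['improve_drug_id', 'chem_name', 'pubchem_id', 'canSMILES',
                'InChIKey', 'formula', 'weight']
    assert list(drugs.columns) == expected, drugs.columns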
diff --git a/schema/coderdata.yaml b/schema/coderdata.yaml
index 3f9dafde..ac393a35 100755
--- a/schema/coderdata.yaml
+++ b/schema/coderdata.yaml
@@ -188,13 +188,13 @@ classes:
       - source
       - study
       attributes:
-        drug_one_id:
+        improve_drug_1:
           description: improve_drug_id of first drug
-        drug_two_id:
+        improve_drug_2:
           description: improve_drug_id of second drug
-        combination_metric:
+        drug_combination_metric:
           description: metric calculated for synergy, or other metric of two drugs
-        combination_value:
+        drug_combination_value:
           description: value of metric for synergy or combination
 enums:
   ResponseMetric:
diff --git a/schema/expected_files.yaml b/schema/expected_files.yaml
index 6604b094..91f9770e 100644
--- a/schema/expected_files.yaml
+++ b/schema/expected_files.yaml
@@ -32,11 +32,29 @@ datasets:
       file: /tmp/mpnst_proteomics.csv
     - target_class: Mutations
       file: /tmp/mpnst_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/mpnst_copy_number.csv
     - target_class: Experiments
       file: /tmp/mpnst_experiments.tsv
     - target_class: Drug
       file: /tmp/mpnst_drugs.tsv
+  mpnstpdx:
+    - target_class: Sample
+      file: /tmp/mpnstpdx_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/mpnstpdx_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/mpnstpdx_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/mpnstpdx_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/mpnstpdx_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/mpnstpdx_experiments.tsv
+    - target_class: Drug
+      file: /tmp/mpnstpdx_drugs.tsv
+
   cptac:
     - target_class: Sample
       file: /tmp/cptac_samples.csv
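The new mpnstpdx block appears to mirror the mpnst entry file-for-file, with Copy Number added to both. A small sketch of how the manifest could be checked locally; it assumes datasets: is the top-level key (as the hunk context suggests) and that PyYAML is available:

    import os
    import yaml   # PyYAML

    with open('schema/expected_files.yaml') as fh:
        manifest = yaml.safe_load(fh)

    for entry in manifest['datasets']['mpnstpdx']:
        status = 'ok' if os.path.exists(entry['file']) else 'missing'
        print(f"{entry['target_class']:16} {entry['file']} [{status}]")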