Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 71 additions & 60 deletions build/beatAML/GetBeatAML.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,25 @@
import argparse
import time

def download_from_github(raw_url, save_path):
"""
Download a file from a raw GitHub URL and save to the specified path.

Parameters
----------
raw_url : str
The raw GitHub URL pointing to the file to be downloaded.
save_path : str
The local path where the downloaded file will be saved.

Returns
-------
None
"""
response = requests.get(raw_url)
with open(save_path, 'wb') as f:
f.write(response.content)
return
# def download_from_github(raw_url, save_path):
# """
# Download a file from a raw GitHub URL and save to the specified path.

# Parameters
# ----------
# raw_url : str
# The raw GitHub URL pointing to the file to be downloaded.
# save_path : str
# The local path where the downloaded file will be saved.

# Returns
# -------
# None
# """
# response = requests.get(raw_url)
# with open(save_path, 'wb') as f:
# f.write(response.content)
# return

def retrieve_figshare_data(url):
"""
Expand Down Expand Up @@ -178,14 +178,14 @@ def retrieve_drug_info(compound_name):
properties = data["PropertyTable"]["Properties"][0]
pubchem_id = properties.get('CID',np.nan)
canSMILES = properties.get("CanonicalSMILES", np.nan)
isoSMILES = properties.get("IsomericSMILES", np.nan)
# isoSMILES = properties.get("IsomericSMILES", np.nan)
InChIKey = properties.get("InChIKey", np.nan)
formula = properties.get("MolecularFormula", np.nan)
weight = properties.get("MolecularWeight", np.nan)

return pubchem_id, canSMILES, isoSMILES, InChIKey, formula, weight
return pubchem_id, canSMILES, InChIKey, formula, weight
else:
return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
return np.nan, np.nan, np.nan, np.nan, np.nan


def update_dataframe_with_pubchem(d_df):
Expand Down Expand Up @@ -230,14 +230,14 @@ def update_dataframe_with_pubchem(d_df):
if row['chem_name'] in data_dict and not all(pd.isna(val) for val in data_dict[row['chem_name']]):
values = data_dict[row['chem_name']]
else:
values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan))
values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan))

d_df.at[idx, 'pubchem_id'] = values[0]
d_df.at[idx, "canSMILES"] = values[1]
d_df.at[idx, "isoSMILES"] = values[2]
d_df.at[idx, "InChIKey"] = values[3]
d_df.at[idx, "formula"] = values[4]
d_df.at[idx, "weight"] = values[5]
# d_df.at[idx, "isoSMILES"] = values[2]
d_df.at[idx, "InChIKey"] = values[2]
d_df.at[idx, "formula"] = values[3]
d_df.at[idx, "weight"] = values[4]

return d_df

Expand All @@ -250,24 +250,24 @@ def merge_drug_info(d_df,drug_map):
d_df : pd.DataFrame
Main drug dataframe containing drug-related columns.
drug_map : pd.DataFrame
Mapping dataframe containing drug information and the column 'isoSMILES'.
Mapping dataframe containing drug information and the column 'canSMILES'.

Returns
-------
pd.DataFrame
The merged dataframe containing combined drug information.
"""
print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
d_df['isoSMILES'] = d_df['isoSMILES'].astype(str)
drug_map['isoSMILES'] = drug_map['isoSMILES'].astype(str)
result_df = d_df.merge(drug_map[['isoSMILES', 'improve_drug_id']], on='isoSMILES', how='left')
# print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
d_df['canSMILES'] = d_df['canSMILES'].astype(str)
drug_map['canSMILES'] = drug_map['canSMILES'].astype(str)
result_df = d_df.merge(drug_map[['canSMILES', 'improve_drug_id']], on='canSMILES', how='left')
return result_df

def format_drug_map(drug_map_path):
"""
Format and clean up the drug mapping file.

Reads a drug map file, removes duplicates based on the 'isoSMILES' column,
Reads a drug map file, removes duplicates based on the 'canSMILES' column,
and returns the cleaned dataframe.

Parameters
Expand All @@ -282,11 +282,11 @@ def format_drug_map(drug_map_path):
"""
if drug_map_path:
drug_map = pd.read_csv(drug_map_path, sep = "\t")
drug_map = drug_map.drop_duplicates(subset='isoSMILES', keep='first')
drug_map = drug_map.drop_duplicates(subset='canSMILES', keep='first')
else:
drug_map = pd.DataFrame(columns=[
'improve_drug_id', 'chem_name', 'pubchem_id', 'canSMILES',
'isoSMILES', 'InChIKey', 'formula', 'weight'
'improve_drug_id', 'chem_name', 'pubchem_id',
'canSMILES', 'InChIKey', 'formula', 'weight'
])
return drug_map

Expand Down Expand Up @@ -316,7 +316,7 @@ def format_drug_df(drug_path):

def add_improve_id(previous_df, new_df):
"""
Add 'improve_drug_id' to the new dataframe based on unique 'isoSMILES' not present in the previous dataframe.
Add 'improve_drug_id' to the new dataframe based on unique 'canSMILES' not present in the previous dataframe.

Parameters
----------
Expand All @@ -335,16 +335,16 @@ def add_improve_id(previous_df, new_df):
max_id = max(id_list) if id_list else 0
else:
max_id = 0
# Identify isoSMILES in the new dataframe that don't exist in the old dataframe
unique_new_smiles = set(new_df['isoSMILES']) - set(previous_df['isoSMILES'])
# Identify rows in the new dataframe with isoSMILES that are unique and where improve_drug_id is NaN
mask = (new_df['isoSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna())
# Identify canSMILES in the new dataframe that don't exist in the old dataframe
unique_new_smiles = set(new_df['canSMILES']) - set(previous_df['canSMILES'])
# Identify rows in the new dataframe with canSMILES that are unique and where improve_drug_id is NaN
mask = (new_df['canSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna())
id_map = {}
for smiles in unique_new_smiles:
max_id += 1
id_map[smiles] = f"SMI_{max_id}"
# Apply the mapping to the new dataframe for rows with unique isoSMILES and NaN improve_drug_id
new_df.loc[mask, 'improve_drug_id'] = new_df['isoSMILES'].map(id_map)
# Apply the mapping to the new dataframe for rows with unique canSMILES and NaN improve_drug_id
new_df.loc[mask, 'improve_drug_id'] = new_df['canSMILES'].map(id_map)
return new_df


Expand Down Expand Up @@ -466,8 +466,14 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
right_on='other_id',
how='left')
mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id'))

print(mapped_df.to_string())
mapped_df['improve_sample_id'] = mapped_df['improve_sample_id'].astype(int)
mapped_df['entrez_id'] = mapped_df['entrez_id'].fillna(0)
mapped_df['entrez_id'] = mapped_df['entrez_id'].astype(int)
mapped_df['source'] = 'synapse'
mapped_df['study'] = 'BeatAML'
mapped_df =mapped_df.drop_duplicates()

final_dataframe = mapped_df.dropna()
return final_dataframe
Expand Down Expand Up @@ -541,7 +547,7 @@ def generate_drug_list(drug_map_path,drug_path):
d_res = add_improve_id(drug_map, d_res)
#Drug Data
#print(d_res)
drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES","isoSMILES"]]
drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES"]]
drug_res = drug_res.drop_duplicates()
drug_res.to_csv("/tmp/beataml_drugs.tsv",sep="\t", index=False)

Expand Down Expand Up @@ -587,7 +593,12 @@ def generate_drug_list(drug_map_path,drug_path):
# 'syn32533104',
# 'syn32529921',
'syn26642974',
'syn26427390'
'syn26427390',
'syn64126458',
'syn64126462',
'syn64126463',
'syn64126464',
'syn64126468'
]
print("Downloading Files from Synapse")
for entity_id in entity_ids:
Expand All @@ -597,13 +608,13 @@ def generate_drug_list(drug_map_path,drug_path):
#gene_url = "https://figshare.com/ndownloader/files/40576109?private_link=525f7777039f4610ef47"
#entrez_map_file = retrieve_figshare_data(gene_url)

additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
# additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
sample_mapping_file = "beataml_waves1to4_sample_mapping.xlsx"
download_from_github(additional_mapping_url, sample_mapping_file)
# download_from_github(additional_mapping_url, sample_mapping_file)

supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
# supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
supplimentary_file = '1-s2.0-S1535610822003129-mmc2.xlsx'
download_from_github(supplementary_url, supplimentary_file)
# download_from_github(supplementary_url, supplimentary_file)


if args.samples:
Expand All @@ -619,26 +630,26 @@ def generate_drug_list(drug_map_path,drug_path):
else:
print("Drug File Provided. Proceeding with build.")
original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
download_from_github(original_drug_url, original_drug_file)
generate_drug_list(args.drugFile, original_drug_file) ##this doesn't exist, need to add
# original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
# download_from_github(original_drug_url, original_drug_file)
generate_drug_list(args.drugFile, original_drug_file)
if args.omics:
if args.genes is None or args.curSamples is None:
print('Cannot process omics without sample mapping and gene mapping files')
exit()
else:
improve_map_file = args.curSamples
transcriptomics_file = "beataml_waves1to4_counts_dbgap.txt" #"beataml_waves1to4_norm_exp_dbgap.txt" ##this is the wrong file, these are the normalize values
transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
download_from_github(transcriptomics_url, transcriptomics_file)
# transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
# download_from_github(transcriptomics_url, transcriptomics_file)

mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt"
mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
download_from_github(mutations_url, mutations_file)
# mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
# download_from_github(mutations_url, mutations_file)

mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx"
mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
download_from_github(mutation_map_url, mutation_map_file)
# mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
# download_from_github(mutation_map_url, mutation_map_file)
# New Transcriptomics Data
print("Starting Transcriptomics Data")
##first run conversion tool
Expand Down Expand Up @@ -680,9 +691,9 @@ def generate_drug_list(drug_map_path,drug_path):
imp_samp_map = pd.read_csv(args.curSamples)
imp_drug_map = pd.read_csv(args.drugFile,sep='\t')
original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
# original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
# Generate Raw Drugs File to use in Curve fitting algorithm
download_from_github(original_drug_url, original_drug_file)
# download_from_github(original_drug_url, original_drug_file)
# Experiment Data
updated_raw_drug_file = "beatAML_drug_raw.tsv"
generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)
Expand Down
6 changes: 3 additions & 3 deletions build/broad_sanger/03a-nci60Drugs.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def main():
opts = parser.parse_args()

###primary DF
df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],'isoSMILES':[],\
df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],\
'InChIKey':[],'formula':[],'weight':[],'pubchem_id':[]}

print('Downloading NSC identifiers for nci60 data')
Expand Down Expand Up @@ -69,7 +69,7 @@ def main():
upper=[a.upper() for a in smiles['SMILES']]
smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
##reduce to smiels only in current drugs
ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
# ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
ssmiles = ssmiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
arr = set(pubchems['CID'])
Expand Down Expand Up @@ -102,7 +102,7 @@ def main():
{
"improve_drug_id": ["SMI_"+str(a) for a in range(max_imp+1,max_imp+1+smicount,1)],
'canSMILES': [a for a in set(mdf['SMILES'])],
'isoSMILES': [a for a in set(mdf['SMILES'])],
# 'isoSMILES': [a for a in set(mdf['SMILES'])],
'InChIKey': [None for a in range(smicount)],
'formula': [None for a in range(smicount)],
'weight': [None for a in range(smicount)]
Expand Down
5 changes: 3 additions & 2 deletions build/broad_sanger/04b-nci60-updated.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,11 @@ def main():

finaldf = pl.DataFrame(
{
'source':['NCI60' for a in molar['improve_drug_id']], ##2024 build
'source':['NCI60_24' for a in molar['improve_drug_id']], ##2024 build
'improve_sample_id':molar['improve_sample_id'],
'Drug':molar['improve_drug_id'],
'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
# 'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
'study': "NCI60",
'time':molar['time'],
'time_unit':molar['time_unit'],
'DOSE': [(10**a)*1000000 for a in molar['CONCENTRATION']], ##move from molar to uM to match pharmacoDB
Expand Down
43 changes: 43 additions & 0 deletions build/broad_sanger/05a_remove_problem_drugs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import gc
import polars as pl



def main():
    """Drop known-problematic drugs before the per-dataset split.

    Reads ``broad_sanger_drugs.tsv`` and ``broad_sanger_experiments.tsv``,
    removes every drug whose ``chem_name`` matches a hard-coded block list
    (matched case-insensitively), and rewrites both TSV files in place.
    """
    drugs_path = "broad_sanger_drugs.tsv"
    experiments_path = "broad_sanger_experiments.tsv"

    drugs = pl.read_csv(drugs_path, separator="\t")
    experiments = pl.read_csv(experiments_path, separator="\t")

    # Block list is kept lowercase so the comparison can ignore case.
    blocked_names = [
        'brd-k03911514',
        'brd-k07442505',
        'brd-k13185470',
        'brd-k16130065',
        'brd-k20514654',
        'brd-k27188169',
        'brd-k55473186',
        'yl54',
        'brd-k58730230',
        'brd-k79669418',
        'brd-k99584050']

    # Locate the offending drug rows, then collect their IDs so the same
    # set of drugs can be stripped from both tables.
    flagged = drugs.filter(
        pl.col("chem_name").str.to_lowercase().is_in(blocked_names)
    )
    flagged_ids = flagged["improve_drug_id"].to_list()

    # Keep only rows whose improve_drug_id was not flagged, and overwrite
    # the input files with the filtered tables.
    keep = ~pl.col("improve_drug_id").is_in(flagged_ids)
    drugs.filter(keep).write_csv(drugs_path, separator="\t")
    experiments.filter(keep).write_csv(experiments_path, separator="\t")


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


def main():

datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]
omics_datatypes = ["transcriptomics","proteomics", "copy_number","mutations"] # csv
samples_datatypes = ["samples"] #csv
Expand Down
8 changes: 6 additions & 2 deletions build/broad_sanger/build_misc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@ set -euo pipefail
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

cp /tmp/broad_sanger* .
echo "Running 05_separate_datasets.py..."
/opt/venv/bin/python 05_separate_datasets.py

echo "Running 05a_remove_problem_drugs.py..."
/opt/venv/bin/python 05a_remove_problem_drugs.py

echo "Running 05b_separate_datasets.py..."
/opt/venv/bin/python 05b_separate_datasets.py

echo "Removing broad_sanger* files..."
rm broad_sanger*
Loading