Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 71 additions & 60 deletions build/beatAML/GetBeatAML.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,25 @@
import argparse
import time

def download_from_github(raw_url, save_path):
"""
Download a file from a raw GitHub URL and save to the specified path.

Parameters
----------
raw_url : str
The raw GitHub URL pointing to the file to be downloaded.
save_path : str
The local path where the downloaded file will be saved.

Returns
-------
None
"""
response = requests.get(raw_url)
with open(save_path, 'wb') as f:
f.write(response.content)
return
# def download_from_github(raw_url, save_path):
# """
# Download a file from a raw GitHub URL and save to the specified path.

# Parameters
# ----------
# raw_url : str
# The raw GitHub URL pointing to the file to be downloaded.
# save_path : str
# The local path where the downloaded file will be saved.

# Returns
# -------
# None
# """
# response = requests.get(raw_url)
# with open(save_path, 'wb') as f:
# f.write(response.content)
# return

def retrieve_figshare_data(url):
"""
Expand Down Expand Up @@ -178,14 +178,14 @@ def retrieve_drug_info(compound_name):
properties = data["PropertyTable"]["Properties"][0]
pubchem_id = properties.get('CID',np.nan)
canSMILES = properties.get("CanonicalSMILES", np.nan)
isoSMILES = properties.get("IsomericSMILES", np.nan)
# isoSMILES = properties.get("IsomericSMILES", np.nan)
InChIKey = properties.get("InChIKey", np.nan)
formula = properties.get("MolecularFormula", np.nan)
weight = properties.get("MolecularWeight", np.nan)

return pubchem_id, canSMILES, isoSMILES, InChIKey, formula, weight
return pubchem_id, canSMILES, InChIKey, formula, weight
else:
return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
return np.nan, np.nan, np.nan, np.nan, np.nan


def update_dataframe_with_pubchem(d_df):
Expand Down Expand Up @@ -230,14 +230,14 @@ def update_dataframe_with_pubchem(d_df):
if row['chem_name'] in data_dict and not all(pd.isna(val) for val in data_dict[row['chem_name']]):
values = data_dict[row['chem_name']]
else:
values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan))
values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan))

d_df.at[idx, 'pubchem_id'] = values[0]
d_df.at[idx, "canSMILES"] = values[1]
d_df.at[idx, "isoSMILES"] = values[2]
d_df.at[idx, "InChIKey"] = values[3]
d_df.at[idx, "formula"] = values[4]
d_df.at[idx, "weight"] = values[5]
# d_df.at[idx, "isoSMILES"] = values[2]
d_df.at[idx, "InChIKey"] = values[2]
d_df.at[idx, "formula"] = values[3]
d_df.at[idx, "weight"] = values[4]

return d_df

Expand All @@ -250,24 +250,24 @@ def merge_drug_info(d_df,drug_map):
d_df : pd.DataFrame
Main drug dataframe containing drug-related columns.
drug_map : pd.DataFrame
Mapping dataframe containing drug information and the column 'isoSMILES'.
Mapping dataframe containing drug information and the column 'canSMILES'.

Returns
-------
pd.DataFrame
The merged dataframe containing combined drug information.
"""
print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
d_df['isoSMILES'] = d_df['isoSMILES'].astype(str)
drug_map['isoSMILES'] = drug_map['isoSMILES'].astype(str)
result_df = d_df.merge(drug_map[['isoSMILES', 'improve_drug_id']], on='isoSMILES', how='left')
# print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
d_df['canSMILES'] = d_df['canSMILES'].astype(str)
drug_map['canSMILES'] = drug_map['canSMILES'].astype(str)
result_df = d_df.merge(drug_map[['canSMILES', 'improve_drug_id']], on='canSMILES', how='left')
return result_df

def format_drug_map(drug_map_path):
"""
Format and clean up the drug mapping file.

Reads a drug map file, removes duplicates based on the 'isoSMILES' column,
Reads a drug map file, removes duplicates based on the 'canSMILES' column,
and returns the cleaned dataframe.

Parameters
Expand All @@ -282,11 +282,11 @@ def format_drug_map(drug_map_path):
"""
if drug_map_path:
drug_map = pd.read_csv(drug_map_path, sep = "\t")
drug_map = drug_map.drop_duplicates(subset='isoSMILES', keep='first')
drug_map = drug_map.drop_duplicates(subset='canSMILES', keep='first')
else:
drug_map = pd.DataFrame(columns=[
'improve_drug_id', 'chem_name', 'pubchem_id', 'canSMILES',
'isoSMILES', 'InChIKey', 'formula', 'weight'
'improve_drug_id', 'chem_name', 'pubchem_id',
'canSMILES', 'InChIKey', 'formula', 'weight'
])
return drug_map

Expand Down Expand Up @@ -316,7 +316,7 @@ def format_drug_df(drug_path):

def add_improve_id(previous_df, new_df):
"""
Add 'improve_drug_id' to the new dataframe based on unique 'isoSMILES' not present in the previous dataframe.
Add 'improve_drug_id' to the new dataframe based on unique 'canSMILES' not present in the previous dataframe.

Parameters
----------
Expand All @@ -335,16 +335,16 @@ def add_improve_id(previous_df, new_df):
max_id = max(id_list) if id_list else 0
else:
max_id = 0
# Identify isoSMILES in the new dataframe that don't exist in the old dataframe
unique_new_smiles = set(new_df['isoSMILES']) - set(previous_df['isoSMILES'])
# Identify rows in the new dataframe with isoSMILES that are unique and where improve_drug_id is NaN
mask = (new_df['isoSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna())
# Identify canSMILES in the new dataframe that don't exist in the old dataframe
unique_new_smiles = set(new_df['canSMILES']) - set(previous_df['canSMILES'])
# Identify rows in the new dataframe with canSMILES that are unique and where improve_drug_id is NaN
mask = (new_df['canSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna())
id_map = {}
for smiles in unique_new_smiles:
max_id += 1
id_map[smiles] = f"SMI_{max_id}"
# Apply the mapping to the new dataframe for rows with unique isoSMILES and NaN improve_drug_id
new_df.loc[mask, 'improve_drug_id'] = new_df['isoSMILES'].map(id_map)
# Apply the mapping to the new dataframe for rows with unique canSMILES and NaN improve_drug_id
new_df.loc[mask, 'improve_drug_id'] = new_df['canSMILES'].map(id_map)
return new_df


Expand Down Expand Up @@ -466,8 +466,14 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
right_on='other_id',
how='left')
mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id'))

print(mapped_df.to_string())
mapped_df['improve_sample_id'] = mapped_df['improve_sample_id'].astype(int)
mapped_df['entrez_id'] = mapped_df['entrez_id'].fillna(0)
mapped_df['entrez_id'] = mapped_df['entrez_id'].astype(int)
mapped_df['source'] = 'synapse'
mapped_df['study'] = 'BeatAML'
mapped_df =mapped_df.drop_duplicates()

final_dataframe = mapped_df.dropna()
return final_dataframe
Expand Down Expand Up @@ -541,7 +547,7 @@ def generate_drug_list(drug_map_path,drug_path):
d_res = add_improve_id(drug_map, d_res)
#Drug Data
#print(d_res)
drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES","isoSMILES"]]
drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES"]]
drug_res = drug_res.drop_duplicates()
drug_res.to_csv("/tmp/beataml_drugs.tsv",sep="\t", index=False)

Expand Down Expand Up @@ -587,7 +593,12 @@ def generate_drug_list(drug_map_path,drug_path):
# 'syn32533104',
# 'syn32529921',
'syn26642974',
'syn26427390'
'syn26427390',
'syn64126458',
'syn64126462',
'syn64126463',
'syn64126464',
'syn64126468'
]
print("Downloading Files from Synapse")
for entity_id in entity_ids:
Expand All @@ -597,13 +608,13 @@ def generate_drug_list(drug_map_path,drug_path):
#gene_url = "https://figshare.com/ndownloader/files/40576109?private_link=525f7777039f4610ef47"
#entrez_map_file = retrieve_figshare_data(gene_url)

additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
# additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
sample_mapping_file = "beataml_waves1to4_sample_mapping.xlsx"
download_from_github(additional_mapping_url, sample_mapping_file)
# download_from_github(additional_mapping_url, sample_mapping_file)

supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
# supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
supplimentary_file = '1-s2.0-S1535610822003129-mmc2.xlsx'
download_from_github(supplementary_url, supplimentary_file)
# download_from_github(supplementary_url, supplimentary_file)


if args.samples:
Expand All @@ -619,26 +630,26 @@ def generate_drug_list(drug_map_path,drug_path):
else:
print("Drug File Provided. Proceeding with build.")
original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
download_from_github(original_drug_url, original_drug_file)
generate_drug_list(args.drugFile, original_drug_file) ##this doesn't exist, need to add
# original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
# download_from_github(original_drug_url, original_drug_file)
generate_drug_list(args.drugFile, original_drug_file)
if args.omics:
if args.genes is None or args.curSamples is None:
print('Cannot process omics without sample mapping and gene mapping files')
exit()
else:
improve_map_file = args.curSamples
transcriptomics_file = "beataml_waves1to4_counts_dbgap.txt" #"beataml_waves1to4_norm_exp_dbgap.txt" ##this is the wrong file, these are the normalize values
transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
download_from_github(transcriptomics_url, transcriptomics_file)
# transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
# download_from_github(transcriptomics_url, transcriptomics_file)

mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt"
mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
download_from_github(mutations_url, mutations_file)
# mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
# download_from_github(mutations_url, mutations_file)

mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx"
mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
download_from_github(mutation_map_url, mutation_map_file)
# mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
# download_from_github(mutation_map_url, mutation_map_file)
# New Transcriptomics Data
print("Starting Transcriptomics Data")
##first run conversion tool
Expand Down Expand Up @@ -680,9 +691,9 @@ def generate_drug_list(drug_map_path,drug_path):
imp_samp_map = pd.read_csv(args.curSamples)
imp_drug_map = pd.read_csv(args.drugFile,sep='\t')
original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
# original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
# Generate Raw Drugs File to use in Curve fitting algorithm
download_from_github(original_drug_url, original_drug_file)
# download_from_github(original_drug_url, original_drug_file)
# Experiment Data
updated_raw_drug_file = "beatAML_drug_raw.tsv"
generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)
Expand Down
6 changes: 3 additions & 3 deletions build/broad_sanger/03a-nci60Drugs.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def main():
opts = parser.parse_args()

###primary DF
df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],'isoSMILES':[],\
df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],\
'InChIKey':[],'formula':[],'weight':[],'pubchem_id':[]}

print('Downloading NSC identifiers for nci60 data')
Expand Down Expand Up @@ -69,7 +69,7 @@ def main():
upper=[a.upper() for a in smiles['SMILES']]
smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
##reduce to smiels only in current drugs
ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
# ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
ssmiles = ssmiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
arr = set(pubchems['CID'])
Expand Down Expand Up @@ -102,7 +102,7 @@ def main():
{
"improve_drug_id": ["SMI_"+str(a) for a in range(max_imp+1,max_imp+1+smicount,1)],
'canSMILES': [a for a in set(mdf['SMILES'])],
'isoSMILES': [a for a in set(mdf['SMILES'])],
# 'isoSMILES': [a for a in set(mdf['SMILES'])],
'InChIKey': [None for a in range(smicount)],
'formula': [None for a in range(smicount)],
'weight': [None for a in range(smicount)]
Expand Down
5 changes: 3 additions & 2 deletions build/broad_sanger/04b-nci60-updated.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,11 @@ def main():

finaldf = pl.DataFrame(
{
'source':['NCI60' for a in molar['improve_drug_id']], ##2024 build
'source':['NCI60_24' for a in molar['improve_drug_id']], ##2024 build
'improve_sample_id':molar['improve_sample_id'],
'Drug':molar['improve_drug_id'],
'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
# 'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
'study': "NCI60",
'time':molar['time'],
'time_unit':molar['time_unit'],
'DOSE': [(10**a)*1000000 for a in molar['CONCENTRATION']], ##move from molar to uM to match pharmacoDB
Expand Down
43 changes: 43 additions & 0 deletions build/broad_sanger/05a_remove_problem_drugs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import gc
import polars as pl



def main():
    """Drop known-problematic drugs before the per-dataset split.

    Reads ``broad_sanger_drugs.tsv`` and ``broad_sanger_experiments.tsv``,
    removes every drug whose ``chem_name`` matches a hard-coded block list
    (matched case-insensitively), and rewrites both TSV files in place.
    """
    drugs_path = "broad_sanger_drugs.tsv"
    experiments_path = "broad_sanger_experiments.tsv"

    drugs = pl.read_csv(drugs_path, separator="\t")
    experiments = pl.read_csv(experiments_path, separator="\t")

    # Block list is kept lowercase so the comparison can ignore case.
    blocked_names = [
        'brd-k03911514',
        'brd-k07442505',
        'brd-k13185470',
        'brd-k16130065',
        'brd-k20514654',
        'brd-k27188169',
        'brd-k55473186',
        'yl54',
        'brd-k58730230',
        'brd-k79669418',
        'brd-k99584050']

    # Locate the offending drug rows, then collect their IDs so the same
    # set of drugs can be stripped from both tables.
    flagged = drugs.filter(
        pl.col("chem_name").str.to_lowercase().is_in(blocked_names)
    )
    flagged_ids = flagged["improve_drug_id"].to_list()

    # Keep only rows whose improve_drug_id was not flagged, and overwrite
    # the input files with the filtered tables.
    keep = ~pl.col("improve_drug_id").is_in(flagged_ids)
    drugs.filter(keep).write_csv(drugs_path, separator="\t")
    experiments.filter(keep).write_csv(experiments_path, separator="\t")


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


def main():

datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]
omics_datatypes = ["transcriptomics","proteomics", "copy_number","mutations"] # csv
samples_datatypes = ["samples"] #csv
Expand Down
8 changes: 6 additions & 2 deletions build/broad_sanger/build_misc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@ set -euo pipefail
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

cp /tmp/broad_sanger* .
echo "Running 05_separate_datasets.py..."
/opt/venv/bin/python 05_separate_datasets.py

echo "Running 05a_remove_problem_drugs.py..."
/opt/venv/bin/python 05a_remove_problem_drugs.py

echo "Running 05b_separate_datasets.py..."
/opt/venv/bin/python 05b_separate_datasets.py

echo "Removing broad_sanger* files..."
rm broad_sanger*
Loading