diff --git a/build/beatAML/GetBeatAML.py b/build/beatAML/GetBeatAML.py index 5afdff8e..c963db93 100755 --- a/build/beatAML/GetBeatAML.py +++ b/build/beatAML/GetBeatAML.py @@ -9,25 +9,25 @@ import argparse import time -def download_from_github(raw_url, save_path): - """ - Download a file from a raw GitHub URL and save to the specified path. - - Parameters - ---------- - raw_url : str - The raw GitHub URL pointing to the file to be downloaded. - save_path : str - The local path where the downloaded file will be saved. - - Returns - ------- - None - """ - response = requests.get(raw_url) - with open(save_path, 'wb') as f: - f.write(response.content) - return +# def download_from_github(raw_url, save_path): +# """ +# Download a file from a raw GitHub URL and save to the specified path. + +# Parameters +# ---------- +# raw_url : str +# The raw GitHub URL pointing to the file to be downloaded. +# save_path : str +# The local path where the downloaded file will be saved. + +# Returns +# ------- +# None +# """ +# response = requests.get(raw_url) +# with open(save_path, 'wb') as f: +# f.write(response.content) +# return def retrieve_figshare_data(url): """ @@ -178,14 +178,14 @@ def retrieve_drug_info(compound_name): properties = data["PropertyTable"]["Properties"][0] pubchem_id = properties.get('CID',np.nan) canSMILES = properties.get("CanonicalSMILES", np.nan) - isoSMILES = properties.get("IsomericSMILES", np.nan) + # isoSMILES = properties.get("IsomericSMILES", np.nan) InChIKey = properties.get("InChIKey", np.nan) formula = properties.get("MolecularFormula", np.nan) weight = properties.get("MolecularWeight", np.nan) - return pubchem_id, canSMILES, isoSMILES, InChIKey, formula, weight + return pubchem_id, canSMILES, InChIKey, formula, weight else: - return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan + return np.nan, np.nan, np.nan, np.nan, np.nan def update_dataframe_with_pubchem(d_df): @@ -230,14 +230,14 @@ def update_dataframe_with_pubchem(d_df): if row['chem_name'] in data_dict and not all(pd.isna(val) for val in data_dict[row['chem_name']]): values = data_dict[row['chem_name']] else: - values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)) + values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan)) d_df.at[idx, 'pubchem_id'] = values[0] d_df.at[idx, "canSMILES"] = values[1] - d_df.at[idx, "isoSMILES"] = values[2] - d_df.at[idx, "InChIKey"] = values[3] - d_df.at[idx, "formula"] = values[4] - d_df.at[idx, "weight"] = values[5] + # d_df.at[idx, "isoSMILES"] = values[2] + d_df.at[idx, "InChIKey"] = values[2] + d_df.at[idx, "formula"] = values[3] + d_df.at[idx, "weight"] = values[4] return d_df @@ -250,24 +250,24 @@ def merge_drug_info(d_df,drug_map): d_df : pd.DataFrame Main drug dataframe containing drug-related columns. drug_map : pd.DataFrame - Mapping dataframe containing drug information and the column 'isoSMILES'. + Mapping dataframe containing drug information and the column 'canSMILES'. Returns ------- pd.DataFrame The merged dataframe containing combined drug information. 
""" - print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype) - d_df['isoSMILES'] = d_df['isoSMILES'].astype(str) - drug_map['isoSMILES'] = drug_map['isoSMILES'].astype(str) - result_df = d_df.merge(drug_map[['isoSMILES', 'improve_drug_id']], on='isoSMILES', how='left') + # print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype) + d_df['canSMILES'] = d_df['canSMILES'].astype(str) + drug_map['canSMILES'] = drug_map['canSMILES'].astype(str) + result_df = d_df.merge(drug_map[['canSMILES', 'improve_drug_id']], on='canSMILES', how='left') return result_df def format_drug_map(drug_map_path): """ Format and clean up the drug mapping file. - Reads a drug map file, removes duplicates based on the 'isoSMILES' column, + Reads a drug map file, removes duplicates based on the 'canSMILES' column, and returns the cleaned dataframe. Parameters @@ -282,11 +282,11 @@ def format_drug_map(drug_map_path): """ if drug_map_path: drug_map = pd.read_csv(drug_map_path, sep = "\t") - drug_map = drug_map.drop_duplicates(subset='isoSMILES', keep='first') + drug_map = drug_map.drop_duplicates(subset='canSMILES', keep='first') else: drug_map = pd.DataFrame(columns=[ - 'improve_drug_id', 'chem_name', 'pubchem_id', 'canSMILES', - 'isoSMILES', 'InChIKey', 'formula', 'weight' + 'improve_drug_id', 'chem_name', 'pubchem_id', + 'canSMILES', 'InChIKey', 'formula', 'weight' ]) return drug_map @@ -316,7 +316,7 @@ def format_drug_df(drug_path): def add_improve_id(previous_df, new_df): """ - Add 'improve_drug_id' to the new dataframe based on unique 'isoSMILES' not present in the previous dataframe. + Add 'improve_drug_id' to the new dataframe based on unique 'canSMILES' not present in the previous dataframe. Parameters ---------- @@ -335,16 +335,16 @@ def add_improve_id(previous_df, new_df): max_id = max(id_list) if id_list else 0 else: max_id = 0 - # Identify isoSMILES in the new dataframe that don't exist in the old dataframe - unique_new_smiles = set(new_df['isoSMILES']) - set(previous_df['isoSMILES']) - # Identify rows in the new dataframe with isoSMILES that are unique and where improve_drug_id is NaN - mask = (new_df['isoSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna()) + # Identify canSMILES in the new dataframe that don't exist in the old dataframe + unique_new_smiles = set(new_df['canSMILES']) - set(previous_df['canSMILES']) + # Identify rows in the new dataframe with canSMILES that are unique and where improve_drug_id is NaN + mask = (new_df['canSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna()) id_map = {} for smiles in unique_new_smiles: max_id += 1 id_map[smiles] = f"SMI_{max_id}" - # Apply the mapping to the new dataframe for rows with unique isoSMILES and NaN improve_drug_id - new_df.loc[mask, 'improve_drug_id'] = new_df['isoSMILES'].map(id_map) + # Apply the mapping to the new dataframe for rows with unique canSMILES and NaN improve_drug_id + new_df.loc[mask, 'improve_drug_id'] = new_df['canSMILES'].map(id_map) return new_df @@ -466,8 +466,14 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N right_on='other_id', how='left') mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id')) + + print(mapped_df.to_string()) + mapped_df['improve_sample_id'] = mapped_df['improve_sample_id'].astype(int) + mapped_df['entrez_id'] = mapped_df['entrez_id'].fillna(0) + mapped_df['entrez_id'] = mapped_df['entrez_id'].astype(int) mapped_df['source'] = 'synapse' mapped_df['study'] = 'BeatAML' + mapped_df =mapped_df.drop_duplicates() 
final_dataframe = mapped_df.dropna() return final_dataframe @@ -541,7 +547,7 @@ def generate_drug_list(drug_map_path,drug_path): d_res = add_improve_id(drug_map, d_res) #Drug Data #print(d_res) - drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES","isoSMILES"]] + drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES"]] drug_res = drug_res.drop_duplicates() drug_res.to_csv("/tmp/beataml_drugs.tsv",sep="\t", index=False) @@ -587,7 +593,12 @@ def generate_drug_list(drug_map_path,drug_path): # 'syn32533104', # 'syn32529921', 'syn26642974', - 'syn26427390' + 'syn26427390', + 'syn64126458', + 'syn64126462', + 'syn64126463', + 'syn64126464', + 'syn64126468' ] print("Downloading Files from Synapse") for entity_id in entity_ids: @@ -597,13 +608,13 @@ def generate_drug_list(drug_map_path,drug_path): #gene_url = "https://figshare.com/ndownloader/files/40576109?private_link=525f7777039f4610ef47" #entrez_map_file = retrieve_figshare_data(gene_url) - additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx" + # additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx" sample_mapping_file = "beataml_waves1to4_sample_mapping.xlsx" - download_from_github(additional_mapping_url, sample_mapping_file) + # download_from_github(additional_mapping_url, sample_mapping_file) - supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx' + # supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx' supplimentary_file = '1-s2.0-S1535610822003129-mmc2.xlsx' - download_from_github(supplementary_url, supplimentary_file) + # download_from_github(supplementary_url, supplimentary_file) if args.samples: @@ -619,9 +630,9 @@ def generate_drug_list(drug_map_path,drug_path): else: print("Drug File Provided. 
Proceeding with build.") original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" - original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" - download_from_github(original_drug_url, original_drug_file) - generate_drug_list(args.drugFile, original_drug_file) ##this doesn't exist, need to add + # original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" + # download_from_github(original_drug_url, original_drug_file) + generate_drug_list(args.drugFile, original_drug_file) if args.omics: if args.genes is None or args.curSamples is None: print('Cannot process omics without sample mapping and gene mapping files') @@ -629,16 +640,16 @@ def generate_drug_list(drug_map_path,drug_path): else: improve_map_file = args.curSamples transcriptomics_file = "beataml_waves1to4_counts_dbgap.txt" #"beataml_waves1to4_norm_exp_dbgap.txt" ##this is the wrong file, these are the normalize values - transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt" - download_from_github(transcriptomics_url, transcriptomics_file) + # transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt" + # download_from_github(transcriptomics_url, transcriptomics_file) mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt" - mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt" - download_from_github(mutations_url, mutations_file) + # mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt" + # download_from_github(mutations_url, mutations_file) mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx" - mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx" - download_from_github(mutation_map_url, mutation_map_file) + # mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx" + # download_from_github(mutation_map_url, mutation_map_file) # New Transcriptomics Data print("Starting Transcriptomics Data") ##first run conversion tool @@ -680,9 +691,9 @@ def generate_drug_list(drug_map_path,drug_path): imp_samp_map = pd.read_csv(args.curSamples) imp_drug_map = pd.read_csv(args.drugFile,sep='\t') original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" - original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" + # original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt" # Generate Raw Drugs File to use in Curve fitting algorithm - download_from_github(original_drug_url, original_drug_file) + # download_from_github(original_drug_url, original_drug_file) # Experiment Data updated_raw_drug_file = "beatAML_drug_raw.tsv" generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file) diff --git a/build/broad_sanger/03a-nci60Drugs.py b/build/broad_sanger/03a-nci60Drugs.py index 28720c90..529c002e 100644 --- a/build/broad_sanger/03a-nci60Drugs.py +++ b/build/broad_sanger/03a-nci60Drugs.py @@ -39,7 +39,7 @@ def main(): opts = parser.parse_args() ###primary DF - df = 
{'improve_drug_id':[],'chem_name':[],'canSMILES':[],'isoSMILES':[],\
+    df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],\
         'InChIKey':[],'formula':[],'weight':[],'pubchem_id':[]}
     print('Downloading NSC identifiers for nci60 data')
@@ -69,7 +69,8 @@ def main():
     upper=[a.upper() for a in smiles['SMILES']]
     smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
     ##reduce to smiels only in current drugs
-    ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
+    # ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
+    ssmiles = smiles
     ssmiles = ssmiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
     pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
     arr = set(pubchems['CID'])
@@ -102,7 +102,7 @@ def main():
         {
             "improve_drug_id": ["SMI_"+str(a) for a in range(max_imp+1,max_imp+1+smicount,1)],
             'canSMILES': [a for a in set(mdf['SMILES'])],
-            'isoSMILES': [a for a in set(mdf['SMILES'])],
+            # 'isoSMILES': [a for a in set(mdf['SMILES'])],
             'InChIKey': [None for a in range(smicount)],
             'formula': [None for a in range(smicount)],
             'weight': [None for a in range(smicount)]
diff --git a/build/broad_sanger/04b-nci60-updated.py b/build/broad_sanger/04b-nci60-updated.py
index b6a82001..c515f59d 100644
--- a/build/broad_sanger/04b-nci60-updated.py
+++ b/build/broad_sanger/04b-nci60-updated.py
@@ -107,10 +107,11 @@ def main():
     finaldf = pl.DataFrame(
         {
-            'source':['NCI60' for a in molar['improve_drug_id']], ##2024 build
+            'source':['NCI60_24' for a in molar['improve_drug_id']], ##2024 build
             'improve_sample_id':molar['improve_sample_id'],
             'Drug':molar['improve_drug_id'],
-            'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
+            # 'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
+            'study': "NCI60",
             'time':molar['time'],
             'time_unit':molar['time_unit'],
             'DOSE': [(10**a)*1000000 for a in molar['CONCENTRATION']], ##move from molar to uM to match pharmacoDB
diff --git a/build/broad_sanger/05a_remove_problem_drugs.py b/build/broad_sanger/05a_remove_problem_drugs.py
new file mode 100644
index 00000000..a240fc14
--- /dev/null
+++ b/build/broad_sanger/05a_remove_problem_drugs.py
@@ -0,0 +1,43 @@
+import gc
+import polars as pl
+
+
+
+def main():
+
+    # Remove Problematic Drugs before Splitting Data
+
+    # Load the datasets
+    all_drugs = pl.read_csv("broad_sanger_drugs.tsv", separator="\t")
+    all_experiments = pl.read_csv("broad_sanger_experiments.tsv", separator="\t")
+
+    # Define the brd_list with lowercase entries for case-insensitive matching
+    brd_list = [
+        'brd-k03911514',
+        'brd-k07442505',
+        'brd-k13185470',
+        'brd-k16130065',
+        'brd-k20514654',
+        'brd-k27188169',
+        'brd-k55473186',
+        'yl54',
+        'brd-k58730230',
+        'brd-k79669418',
+        'brd-k99584050']
+
+    # Identify rows in all_drugs that match brd_list entries (case insensitive)
+    removed_drugs = all_drugs.filter(pl.col("chem_name").str.to_lowercase().is_in(brd_list))
+
+    # Store the improve_drug_id IDs of removed entries
+    improve_drug_id = removed_drugs["improve_drug_id"].to_list()
+
+    # Remove these rows from all_drugs and all_experiments
+    all_drugs = all_drugs.filter(~pl.col("improve_drug_id").is_in(improve_drug_id))
+    all_experiments = all_experiments.filter(~pl.col("improve_drug_id").is_in(improve_drug_id))
+
+    all_drugs.write_csv("broad_sanger_drugs.tsv", separator="\t")
+    all_experiments.write_csv("broad_sanger_experiments.tsv", separator="\t")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/build/broad_sanger/05_separate_datasets.py
b/build/broad_sanger/05b_separate_datasets.py similarity index 99% rename from build/broad_sanger/05_separate_datasets.py rename to build/broad_sanger/05b_separate_datasets.py index 9e99adc9..f4d5481d 100644 --- a/build/broad_sanger/05_separate_datasets.py +++ b/build/broad_sanger/05b_separate_datasets.py @@ -4,7 +4,6 @@ def main(): - datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"] omics_datatypes = ["transcriptomics","proteomics", "copy_number","mutations"] # csv samples_datatypes = ["samples"] #csv diff --git a/build/broad_sanger/build_misc.sh b/build/broad_sanger/build_misc.sh index 2fa847f1..d5896e98 100644 --- a/build/broad_sanger/build_misc.sh +++ b/build/broad_sanger/build_misc.sh @@ -4,8 +4,12 @@ set -euo pipefail trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR cp /tmp/broad_sanger* . -echo "Running 05_separate_datasets.py..." -/opt/venv/bin/python 05_separate_datasets.py + +echo "Running 05a_remove_problem_drugs.py..." +/opt/venv/bin/python 05a_remove_problem_drugs.py + +echo "Running 05b_separate_datasets.py..." +/opt/venv/bin/python 05b_separate_datasets.py echo "Removing broad_sanger* files..." rm broad_sanger* diff --git a/build/build_dataset.py b/build/build_dataset.py index 9e63030b..dd585604 100644 --- a/build/build_dataset.py +++ b/build/build_dataset.py @@ -41,6 +41,7 @@ def process_docker(dataset,validate): 'hcmi': ['hcmi'], 'beataml': ['beataml'], 'mpnst': ['mpnst'], + 'mpnstpdx': ['mpnstpdx'], 'cptac': ['cptac'], 'genes': ['genes'], 'upload': ['upload'] @@ -121,7 +122,8 @@ def process_omics(executor, dataset, should_continue): 'mpnst': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'], 'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'], 'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'], - 'hcmi': ['mutations', 'transcriptomics'] + 'hcmi': ['mutations', 'transcriptomics'], + 'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'] } expected_omics = dataset_omics_files.get(dataset, []) @@ -216,7 +218,7 @@ def run_docker_validate_cmd(cmd_arr, all_files_dir, name): Wrapper for 'docker run' command used during validation and uploads. 
''' env = os.environ.copy() - docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp"] + docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp", '--platform=linux/amd64'] docker_run.extend(['upload']) docker_run.extend(cmd_arr) print('Executing:', ' '.join(docker_run)) @@ -256,7 +258,7 @@ def run_schema_checker(dataset): decompress_file(os.path.join('local', all_files_dir, file)) # Run schema checker - schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets + schema_check_command = ['python3', 'check_schema.py', '--datasets'] + datasets run_docker_validate_cmd(schema_check_command, all_files_dir, 'Validation') def main(): diff --git a/build/cptac/getCptacData.py b/build/cptac/getCptacData.py index 527ff700..c3c401a2 100755 --- a/build/cptac/getCptacData.py +++ b/build/cptac/getCptacData.py @@ -380,11 +380,15 @@ def main(): dat_files[dtype_key] = fdf2 else: dat_files[dtype_key] = fdf.dropna() + print(dtype_key) # Now concatenate all the cancers into a single file for dtype_key, df in dat_files.items(): print('Saving ' + "cptac_" + dtype_key + '.csv.gz' + ' file') + print(df.to_string()) + df['entrez_id'] = df['entrez_id'].fillna(0) + df['entrez_id'] = df['entrez_id'].astype(int) df.to_csv("/tmp/" + "cptac_" + dtype_key + '.csv.gz', sep=',', index=False, compression='gzip') if __name__ == '__main__': diff --git a/build/docker/Dockerfile.broad_sanger_omics b/build/docker/Dockerfile.broad_sanger_omics index 2d4cbde8..43f4a428 100755 --- a/build/docker/Dockerfile.broad_sanger_omics +++ b/build/docker/Dockerfile.broad_sanger_omics @@ -34,7 +34,8 @@ ADD build/broad_sanger/build_samples.sh ./ ADD build/broad_sanger/build_omics.sh ./ ADD build/utils/* ./ ADD build/broad_sanger/build_misc.sh ./ -ADD build/broad_sanger/05_separate_datasets.py ./ +ADD build/broad_sanger/05a_remove_problem_drugs.py ./ +ADD build/broad_sanger/05b_separate_datasets.py ./ ADD build/broad_sanger/requirements.txt . ADD build/broad_sanger/omics_requirements.r . diff --git a/build/docker/Dockerfile.mpnst b/build/docker/Dockerfile.mpnst index 92fe9c6d..94e04e87 100755 --- a/build/docker/Dockerfile.mpnst +++ b/build/docker/Dockerfile.mpnst @@ -1,4 +1,4 @@ -FROM r-base:4.3.2 +FROM r-base:4.3.3 # Set environment to noninteractive ENV DEBIAN_FRONTEND=noninteractive diff --git a/build/docker/Dockerfile.mpnstPDX b/build/docker/Dockerfile.mpnstPDX new file mode 100755 index 00000000..86a3c02e --- /dev/null +++ b/build/docker/Dockerfile.mpnstPDX @@ -0,0 +1,54 @@ +FROM r-base:4.3.2 + +# Set environment to noninteractive +ENV DEBIAN_FRONTEND=noninteractive + +# Update package list and install required packages +RUN apt-get update && \ + apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \ + zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev + +# Download and compile Python 3.10 with shared library support +RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \ + tar -xf Python-3.10.12.tgz && \ + cd Python-3.10.12 && \ + ./configure --enable-optimizations --enable-shared && \ + make -j$(nproc) && \ + make altinstall && \ + cd .. 
&& \ + rm -rf Python-3.10.12.tgz Python-3.10.12 + +# Set Python 3.10 as default +RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3 && \ + ln -s /usr/local/bin/pip3.10 /usr/bin/pip3 + +# Update library paths for Python shared library +RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig + +# Create a Python virtual environment +RUN python3 -m venv /opt/venv +RUN /opt/venv/bin/pip install --upgrade pip + +# Set environment variables for reticulate +ENV RETICULATE_PYTHON="/opt/venv/bin/python3" +ENV PYTHONPATH=/app#"${PYTHONPATH}:/app" +WORKDIR /app + +# Set MPLCONFIGDIR to a writable directory +ENV MPLCONFIGDIR=/app/tmp/matplotlib +RUN mkdir -p /app/tmp/matplotlib + +# Add necessary files to the container +ADD build/mpnstpdx/requirements.txt . +ADD build/mpnstpdx/requirements.r . +ADD build/mpnstpdx/* ./ +ADD build/utils/* ./ + +# installing python libraries +RUN /opt/venv/bin/pip3 install -r requirements.txt + +# Install all R libraries from requirements.r +RUN Rscript requirements.r + +# Set up volume for temporary storage +VOLUME ["/tmp"] diff --git a/build/docker/docker-compose.yml b/build/docker/docker-compose.yml index fb9c9ab8..373eb5d6 100644 --- a/build/docker/docker-compose.yml +++ b/build/docker/docker-compose.yml @@ -44,7 +44,15 @@ services: HTTPS_PROXY: ${HTTPS_PROXY} platform: linux/amd64 image: mpnst:latest - + + mpnstpdx: + build: + context: ../../ + dockerfile: build/docker/Dockerfile.mpnstpdx + args: + HTTPS_PROXY: ${HTTPS_PROXY} + platform: linux/amd64 + image: mpnstpdx:latest cptac: build: context: ../../ diff --git a/build/hcmi/02-getHCMIData.py b/build/hcmi/02-getHCMIData.py index a7e3e74a..b329a68a 100644 --- a/build/hcmi/02-getHCMIData.py +++ b/build/hcmi/02-getHCMIData.py @@ -17,67 +17,93 @@ import polars as pl import gc import hashlib +from pathlib import Path def download_tool(url): """ - Download, extract, and make a tool (GDC Client) executable from the provided URL. + Download, extract, and prepare the GDC client tool. Parameters ---------- url : str - The URL from where the tool needs to be downloaded. + The URL to download the tool from. Returns ------- str - Name of the downloaded file. + The path to the `gdc-client` executable. """ - + # Download the file + print("Downloading tool...") filename = wget.download(url) - files_before = os.listdir() + + # First extraction + print(f"\nExtracting {filename}...") shutil.unpack_archive(filename) - files_after = os.listdir() - new_file = str(next(iter((set(files_after) - set(files_before))))) - st = os.stat(new_file) - os.chmod(new_file, st.st_mode | stat.S_IEXEC) - return filename + os.remove(filename) + + # Check for a nested zip file and extract again + extracted_files = [f for f in os.listdir() if os.path.isfile(f) and f.endswith(".zip")] + for zip_file in extracted_files: + print(f"Extracting nested archive: {zip_file}...") + shutil.unpack_archive(zip_file) + os.remove(zip_file) + + gdc_client_path = None + for root, dirs, files in os.walk("."): + if "gdc-client" in files: + gdc_client_path = os.path.join(root, "gdc-client") + break + + if not gdc_client_path: + raise FileNotFoundError("`gdc-client` executable not found after extraction.") + + # Ensure `gdc-client` is executable + print(f"Making {gdc_client_path} executable...") + st = os.stat(gdc_client_path) + os.chmod(gdc_client_path, st.st_mode | stat.S_IEXEC) + + return gdc_client_path def is_tool(name): """ - Check if a specific tool is available on the system or in the current directory. 
+ Check if a specific tool is available on the system. Parameters ---------- name : str - The name of the tool to check. + The name of the tool. Returns ------- bool True if the tool is found, otherwise False. """ - - return which(name) is not None or name in os.listdir() + return shutil.which(name) is not None or name in os.listdir() def ensure_gdc_client(): """ - Ensure that the gdc-client is available on the system. + Ensure that the GDC client tool is available on the system. - If the gdc-client tool isn't found, this function will automatically - download the appropriate version based on the operating system. + If the tool isn't found, this function downloads and prepares it. """ - tool_name = "gdc-client" if not is_tool(tool_name): - print("Downloading gdc-client") + print("GDC client not found. Downloading...") urls = { - "Darwin": 'https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_OSX_x64.zip', - "Windows": 'https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_Windows_x64.zip', - "Linux": 'https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_Ubuntu_x64.zip' + "Darwin": "https://gdc.cancer.gov/system/files/public/file/gdc-client_2.3_OSX_x64-py3.8-macos-14.zip", + "Windows": "https://gdc.cancer.gov/system/files/public/file/gdc-client_2.3_Windows_x64-py3.8-windows-2019.zip", + "Linux": "https://gdc.cancer.gov/system/files/public/file/gdc-client_2.3_Ubuntu_x64-py3.8-ubuntu-20.04.zip" } - download_tool(urls.get(platform.system())) + os_type = platform.system() + url = urls.get(os_type) + if not url: + raise ValueError(f"Unsupported OS: {os_type}") + gdc_client_path = download_tool(url) + print(f"`gdc-client` downloaded and available at {gdc_client_path}") else: - print("gdc-client already installed") + print("`gdc-client` is already installed.") + def extract_uuids_from_manifest(manifest_data): """ diff --git a/build/mpnst/01_mpnst_get_omics.R b/build/mpnst/01_mpnst_get_omics.R index 77a5466d..a1276eff 100755 --- a/build/mpnst/01_mpnst_get_omics.R +++ b/build/mpnst/01_mpnst_get_omics.R @@ -34,6 +34,7 @@ samples_df <- fread(patients)|> pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') tumor_samps<-subset(samples_df,model_type=='tumor') +mt_samps<-subset(samples_df,model_type=='organoid') ##now get the manifest from synapse manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> @@ -45,26 +46,35 @@ manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> ##they each get their own sample identifier pdx_data<-manifest|>dplyr::select(common_name,starts_with("PDX"))|> left_join(pdx_samps)|> - dplyr::select(improve_sample_id,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics') + dplyr::select(improve_sample_id,common_name,model_type,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')|> + subset(!is.na(improve_sample_id)) tumor_data<- manifest|>dplyr::select(common_name,starts_with("Tumor"))|> left_join(tumor_samps)|> - dplyr::select(improve_sample_id,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|> - mutate(Proteomics='') ##we dont have tumor proteomics from these samples + dplyr::select(improve_sample_id,common_name,model_type,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|> + mutate(Proteomics='')|> + subset(!is.na(improve_sample_id)) + ##we dont have tumor proteomics from these samples +#print(tumor_data) + +mt_data<- 
manifest|>dplyr::select(common_name,starts_with("PDX"))|> + left_join(mt_samps)|> + dplyr::select(improve_sample_id,common_name,model_type, RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')|>##we dont have mt data yet, so collecting PDX instead + subset(!is.na(improve_sample_id)) #print(tumor_data) -combined<-rbind(pdx_data,tumor_data)|>distinct() +combined<-rbind(pdx_data,tumor_data,mt_data)|>distinct() # gene mapping table genes_df <- fread(genefile) ##added proteomics first -proteomics<-do.call('rbind',lapply(setdiff(combined$Proteomics,c('',NA,"NA")),function(x){ +proteomics<-do.call('rbind',lapply(setdiff(mt_data$Proteomics,c('',NA,"NA")),function(x){ # if(x!=""){ #print(x) - sample<-subset(combined,Proteomics==x) + sample<-subset(mt_data,Proteomics==x) #print(sample) res<-fread(synGet(x)$path)|> #tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> @@ -88,10 +98,10 @@ fwrite(proteomics,'/tmp/mpnst_proteomics.csv.gz') #### FIRST WE GET RNASeq Data -rnaseq<-do.call('rbind',lapply(setdiff(combined$RNASeq,c(NA,"NA")),function(x){ +rnaseq<-do.call('rbind',lapply(setdiff(mt_data$RNASeq,c(NA,"NA")),function(x){ # if(x!=""){ #print(x) - sample<-subset(combined,RNASeq==x) + sample<-subset(mt_data,RNASeq==x) #print(sample) res<-fread(synGet(x)$path)|> tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> @@ -114,11 +124,11 @@ fwrite(rnaseq,'/tmp/mpnst_transcriptomics.csv.gz') #####NEXT WE DO WES DATA print("Getting WES") -wes<-do.call(rbind,lapply(setdiff(combined$`Mutations`,c(NA,"NA")),function(x){ +wes<-do.call(rbind,lapply(setdiff(mt_data$`Mutations`,c(NA,"NA")),function(x){ x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) print(x) - sample<-subset(combined,Mutations==x) + sample<-subset(mt_data,Mutations==x) print(sample$improve_sample_id) res<-NULL try(res<-fread(synGet(x2)$path)|> @@ -141,11 +151,11 @@ fwrite(wes,'/tmp/mpnst_mutations.csv.gz') print(paste("getting CNV")) ##next let's do CNVs! 
-cnv<-do.call(rbind,lapply(setdiff(combined$CopyNumber,c(NA,"NA")),function(x){
+cnv<-do.call(rbind,lapply(setdiff(mt_data$CopyNumber,c(NA,"NA")),function(x){
   x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T)
   print(x)
-  sample<-subset(combined,CopyNumber==x)
+  sample<-subset(mt_data,CopyNumber==x)
   print(sample$improve_sample_id)
   res<-fread(synGet(x2)$path)
diff --git a/build/mpnst/02_get_drug_data.R b/build/mpnst/02_get_drug_data.R
index 7c07af9b..e90a31fb 100644
--- a/build/mpnst/02_get_drug_data.R
+++ b/build/mpnst/02_get_drug_data.R
@@ -103,7 +103,7 @@ if (!is.na(olddrugfiles)) {
     chem_name = character(),
     pubchem_id = character(),
     canSMILES = character(),
-    isoSMILES = character(),
+    # isoSMILES = character(),
     InChIKey = character(),
     formula = character(),
     weight = numeric(),
@@ -118,7 +118,7 @@ if (!is.na(olddrugfiles)) {
     chem_name = character(),
     pubchem_id = character(),
     canSMILES = character(),
-    isoSMILES = character(),
+    # isoSMILES = character(),
     InChIKey = character(),
     formula = character(),
     weight = numeric(),
diff --git a/build/mpnst/03_get_drug_response_data.R b/build/mpnst/03_get_drug_response_data.R
index 8f02ca80..9bbb6f00 100644
--- a/build/mpnst/03_get_drug_response_data.R
+++ b/build/mpnst/03_get_drug_response_data.R
@@ -32,7 +32,7 @@ org_samps<-subset(samples_df,model_type=='organoid')
 ##now get the manifest from synapse
 manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
-  as.data.frame()|>
+  as.data.table()|>
   dplyr::rename(common_name='Sample')
diff --git a/build/mpnst/README.md b/build/mpnst/README.md
index 10d0bcd9..13c08454 100755
--- a/build/mpnst/README.md
+++ b/build/mpnst/README.md
@@ -12,7 +12,7 @@ directory. Currently using the test files as input.
    `mpnst_samples.csv` file. This pulls from the latest synapse
    project metadata table.
    ```
-   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnst sh build_samples.sh /tmp/build/build_test/test_samples.csv
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnst sh build_samples.sh /tmp/build/build_test/test_samples.csv
    ```
 3. Pull the data and map it to the samples. This uses the metadata
diff --git a/build/mpnst/build_drugs.sh b/build/mpnst/build_drugs.sh
index 3b969d2b..821bc46c 100644
--- a/build/mpnst/build_drugs.sh
+++ b/build/mpnst/build_drugs.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
-set -euo pipefail
+#set -euo pipefail
-trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+#trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
 echo "Running 02_get_drug_data.R with /tmp/mpnst_drugs.tsv and $1."
Rscript 02_get_drug_data.R /tmp/mpnst_drugs.tsv $1 diff --git a/build/mpnst/requirements.r b/build/mpnst/requirements.r index 72dd457d..7796236d 100755 --- a/build/mpnst/requirements.r +++ b/build/mpnst/requirements.r @@ -4,8 +4,8 @@ install.packages('remotes') remotes::install_version('rjson', version = '0.2.21', repos = 'https://cloud.r-project.org') install.packages('synapser', repos = c('http://ran.synapse.org', 'https://cloud.r-project.org')) install.packages("dplyr") -install.packages("data.table") install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org")) +install.packages("data.table") install.packages("R.utils") install.packages("stringr") -install.packages("tidyr") \ No newline at end of file +install.packages("tidyr") diff --git a/build/mpnstPDX/01_mpnstpdx_get_omics.R b/build/mpnstPDX/01_mpnstpdx_get_omics.R new file mode 100755 index 00000000..86e3cbb8 --- /dev/null +++ b/build/mpnstPDX/01_mpnstpdx_get_omics.R @@ -0,0 +1,195 @@ +# Load required libraries +library(data.table) +# library(biomaRt)# biomart issues still exist +library(synapser) +library(dplyr) + +# Retrieve command line arguments +args <- commandArgs(trailingOnly = TRUE) + +# Check if a token was provided +if (length(args) == 0) { + stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [genes]", call. = FALSE) +} + +# Set your personal access token +PAT <- args[1] +patients <- args[2] +genefile <- args[3] + +# Log in to Synapse +synLogin(authToken = PAT) + +# Define the Ensembl mart # biomart issues still exist +# ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") # biomart issues still exist; fix later... + +# Path to the directory to save .sf files +#path <- "./tmp" +#dir.create(path, showWarnings = FALSE) + +# Read the sample mapping CSV and genes.csv +samples_df <- fread(patients)|> + dplyr::select(improve_sample_id,common_name,model_type)|> + distinct()#"mpnst/synapse_NF-MPNSTpdx_samples.csv") + +pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') +tumor_samps<-subset(samples_df,model_type=='tumor') + +##now get the manifest from synapse +manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> + as.data.frame()|> + dplyr::rename(common_name='Sample') + + +##for now we only have tumor and pdx data +##they each get their own sample identifier +pdx_data<-manifest|>dplyr::select(common_name,starts_with("PDX"))|> + left_join(pdx_samps)|> + dplyr::select(improve_sample_id,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics') + +tumor_data<- manifest|>dplyr::select(common_name,starts_with("Tumor"))|> + left_join(tumor_samps)|> + dplyr::select(improve_sample_id,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|> + mutate(Proteomics='') ##we dont have tumor proteomics from these samples +#print(tumor_data) + + +pdx_data<-rbind(pdx_data,tumor_data)|>distinct() + +# gene mapping table +genes_df <- fread(genefile) + + +##added proteomics first +proteomics<-do.call('rbind',lapply(setdiff(pdx_data$Proteomics,c('',NA,"NA")),function(x){ + # if(x!=""){ + #print(x) + sample<-subset(pdx_data,Proteomics==x) + #print(sample) + res<-fread(synGet(x)$path)|> + #tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> + #dplyr::select(-vers)|> + dplyr::rename(gene_symbol='Gene')|> + left_join(genes_df)|> + dplyr::select(entrez_id,proteomics='logRatio')|> + distinct()|> + subset(!is.na(entrez_id))|> + subset(proteomics!=0) + + 
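+  # attach the improve_sample_id plus source/study provenance columns to every proteomics row before returning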
res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) + res$source=rep('NF Data Portal',nrow(res)) + res$study=rep('MPNST PDX',nrow(res)) + return(distinct(res)) + # } +})) + +fwrite(proteomics,'/tmp/mpnstpdx_proteomics.csv.gz') + + +#### FIRST WE GET RNASeq Data + +rnaseq<-do.call('rbind',lapply(setdiff(pdx_data$RNASeq,c(NA,"NA")),function(x){ + # if(x!=""){ + #print(x) + sample<-subset(pdx_data,RNASeq==x) + #print(sample) + res<-fread(synGet(x)$path)|> + tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> + dplyr::select(-vers)|> + left_join(genes_df)|> + dplyr::select(entrez_id,transcriptomics='TPM')|> + subset(!is.na(entrez_id))|> + subset(transcriptomics!=0) + + res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) + res$source=rep('NF Data Portal',nrow(res)) + res$study=rep('MPNST PDX',nrow(res)) + return(distinct(res)) + # } +})) + +fwrite(rnaseq,'/tmp/mpnstpdx_transcriptomics.csv.gz') + + + +#####NEXT WE DO WES DATA +print("Getting WES") +wes<-do.call(rbind,lapply(setdiff(pdx_data$`Mutations`,c(NA,"NA")),function(x){ + + x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) + print(x) + sample<-subset(pdx_data,Mutations==x) + print(sample$improve_sample_id) + res<-NULL + try(res<-fread(synGet(x2)$path)|> + dplyr::select(entrez_id='Entrez_Gene_Id',mutation='HGVSc',variant_classification='Variant_Classification')|> + subset(entrez_id%in%genes_df$entrez_id)|> + distinct()) + if(is.null(res)) + return(NULL) + + res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) + res$source=rep('NF Data Portal',nrow(res)) + res$study=rep('MPNST PDX',nrow(res)) + + return(distinct(res)) + # } +})) + +fwrite(wes,'/tmp/mpnstpdx_mutations.csv.gz') + + +print(paste("getting CNV")) +##next let's do CNVs! +cnv<-do.call(rbind,lapply(setdiff(pdx_data$CopyNumber,c(NA,"NA")),function(x){ + + x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) + print(x) + sample<-subset(pdx_data,CopyNumber==x) + print(sample$improve_sample_id) + res<-fread(synGet(x2)$path) + + long_df<- res|> + tidyr::separate_rows(gene,sep=',')|> + dplyr::rename(gene_symbol='gene')|> + dplyr::left_join(genes_df)|> + subset(!is.na(entrez_id))|> + dplyr::select(entrez_id,log2)|> + dplyr::distinct()|> + dplyr::mutate(copy_number=2^log2)|> + dplyr::select(-log2) + + res<-long_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp + dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', + ifelse(copy_number<0.7311832,'het loss', + ifelse(copy_number<1.214125,'diploid', + ifelse(copy_number<1.422233,'gain','amp')))))|> + mutate(study='MPNST PDX',source='NF Data Portal',improve_sample_id=sample$improve_sample_id[1])|> + dplyr::distinct() + + # long_df <- res[, strsplit(as.character(gene), ","), by = .(chromosome, start, end, depth, log2)] + # filtered_df <- long_df |> + # subset(is.finite(log2))|> + # filter(V1 %in% genes_df$gene) # get only protein coding genes and remove empty gene symbols + # filtered_df <- filtered_df[, .(gene_symbol = V1, + # improve_sample_id = sample$improve_sample_id[1], + # copy_number = 2^log2, + # source = "NF Data Portal", + # study = "MPNST PDX")] + # res<-filtered_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp + # dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', + # ifelse(copy_number<0.7311832,'het loss', + # ifelse(copy_number<1.214125,'diploid', + # ifelse(copy_number<1.422233,'gain','amp')))))|> + # left_join(genes_df)|> + # 
dplyr::select(entrez_id,improve_sample_id,copy_number,copy_call,study,source)|>
+  #   subset(!is.na(entrez_id))|>
+  #   distinct()
+  # res|>group_by(copy_call)|>summarize(n_distinct(entrez_id))
+  return(res)
+  # }
+}))
+
+fwrite(cnv,'/tmp/mpnstpdx_copy_number.csv.gz')
+
+##TODO: get proteomics!!!
diff --git a/build/mpnstPDX/02_get_drug_data.R b/build/mpnstPDX/02_get_drug_data.R
new file mode 100644
index 00000000..1f6ad47e
--- /dev/null
+++ b/build/mpnstPDX/02_get_drug_data.R
@@ -0,0 +1,120 @@
+# Load required libraries
+library(data.table)
+# library(biomaRt)# biomart issues still exist
+library(dplyr)
+library(stringr)
+library(reticulate)
+library(synapser)
+library(tidyr)
+
+
+# Retrieve command line arguments
+args <- commandArgs(trailingOnly = TRUE)
+
+# Check if a token was provided
+if (length(args) == 0) {
+  stop("No token or drug file provided. Usage: Rscript my_script.R [olddrugfile] [newdrugfile]", call. = FALSE)
+}
+
+# Set your personal access token
+PAT <- args[1]
+olddrugfiles <- args[2]
+newdrugfile <- args[3]
+# Log in to Synapse
+synLogin(authToken = PAT)
+
+
+##now get the manifest from synapse
+manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
+  as.data.frame()|>
+  dplyr::rename(common_name='Sample')
+
+
+##PDX contain list of files
+pdx<-manifest|>
+  dplyr::select(common_name,PDX_Drug_Data)|>
+  distinct()|>
+  subset(!is.na(PDX_Drug_Data))
+
+
+
+
+
+##define functions
+
+#print(pdx)
+##now loop through manifest to get all the files
+pdx_fold <- data.table(pdx)[,strsplit(as.character(PDX_Drug_Data),","), by = .(common_name)]|>
+  subset(!is.na(V1))|>
+  subset(V1!='NA')|>
+  dplyr::rename(id='V1')
+
+#print(pdx_fold)
+###this is not all of them
+pdx_meta<-do.call(rbind,lapply(pdx_fold$id, function(x) synapser::synGetAnnotations(x)|>
+  as.data.frame()|>
+  dplyr::select('experimentalCondition')|>
+  dplyr::mutate(id=x)))|>
+  left_join(pdx_fold)|>
+  tidyr::separate_rows(experimentalCondition,sep=';')|>
+  mutate(chem_name=tolower(experimentalCondition))
+
+#pdx_drug <- data.table(pdx_meta)[,strsplit(as.character(experimentalCondition),';'),by= .(common_name,id)]|>
+#  mutate(drug=tolower(experimentalCondition))
+#drugs<-sapply(pdx_meta$experimentalCondition,function(x) tolower(unlist(strsplit(x,split=';'))))|>
+#  unlist()|>
+#  unique()
+
+drugs<-setdiff(pdx_meta$chem_name,'control')
+
+
+print(paste(drugs,collapse=','))
+
+
+##copy old drug to new drug
+olddrugs<-do.call(rbind,lapply(unique(unlist(strsplit(olddrugfiles,split=','))),function(x) read.table(x,header=T,sep='\t',quote='',comment.char='')))
+olddrugs<-unique(olddrugs)
+
+print(paste('Read in ',nrow(olddrugs),'old drug files'))
+
+fdrugs<-subset(olddrugs,chem_name%in%drugs)
+if(nrow(fdrugs)>0){
+  dids<-fdrugs$improve_drug_id
+}else{
+  dids<-c()
+}
+newdrugs<-subset(olddrugs,improve_drug_id%in%dids)
+
+print(paste('Found',length(dids),'improved drug ids that exist, saving those'))
+
+
+  #file.copy(olddrugfile,newdrugfile)
+write.table(newdrugs,file=newdrugfile,sep='\t',row.names=F,quote=FALSE,col.names=T)
+output_file_path <- newdrugfile
+ignore_file_path <- '/tmp/mpnstpdx_ignore_chems.txt'
+
+
+##now load reticulate down here
+
+
+
+use_python("/opt/venv/bin/python3", required = TRUE)
+source_python("pubchem_retrieval.py")
+
+update_dataframe_and_write_tsv(unique_names=drugs,output_filename=output_file_path,ignore_chems=ignore_file_path)
+
+
+tab<-read.table(newdrugfile,sep='\t',header=T,quote="",comment.char="")
+
+newdrugs<-tab|>
+  subset(chem_name%in%tolower(drugs))
+
+tab<-tab|>
+ 
subset(improve_drug_id%in%newdrugs$improve_drug_id) + +write.table(tab,file=newdrugfile,sep='\t',row.names=FALSE,quote=FALSE) + + +##now call the python drug script + + diff --git a/build/mpnstPDX/03_get_drug_response_data.R b/build/mpnstPDX/03_get_drug_response_data.R new file mode 100644 index 00000000..095dba34 --- /dev/null +++ b/build/mpnstPDX/03_get_drug_response_data.R @@ -0,0 +1,174 @@ +# Load required libraries +library(data.table) +# library(biomaRt)# biomart issues still exist +library(synapser) +library(dplyr) +library(stringr) +# Retrieve command line arguments +args <- commandArgs(trailingOnly = TRUE) + +# Check if a token was provided +if (length(args) == 0) { + stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [drugs]", call. = FALSE) +} + +# Set your personal access token +PAT <- args[1] +patients <- args[2] +drugfile <- args[3] + +# Log in to Synapse +synLogin(authToken = PAT) + + +# Read the sample mapping CSV and genes.csv +samples_df <- fread(patients)|> + dplyr::select(improve_sample_id,common_name,model_type)|> + distinct()#"mpnst/synapse_NF-MPNST_samples.csv") +print(head(samples_df)) + +pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') +org_samps<-subset(samples_df,model_type=='organoid') + +##now get the manifest from synapse +manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> + as.data.frame()|> + dplyr::rename(common_name='Sample') + + +##PDX contain list of files +pdx<-manifest|> + dplyr::select(common_name,PDX_Drug_Data)|> + subset(!PDX_Drug_Data%in%c("NA",NA))|> + left_join(pdx_samps)|> + distinct() + +print(pdx) + + +# Modify the extract_date_hour function to return a named vector +extract_date_hour <- function(experiment_id) { + pattern <- "(\\d{6})_?(\\d{2,3})?" + matches <- str_match(experiment_id, pattern) + date <- matches[, 2] + hour <- matches[, 3] + date[is.na(date)] <- NA # Replace with NA instead of blank + hour[is.na(hour)] <- 48 # Replace with 48 instead of blank (default) + return(list(date = date, hour = hour)) +} + + + +##define functions + +##first function to get children from parentId + +##now loop through manifest to get all the files +#mts_fold <- data.table(mts)[,strsplit(as.character(MicroTissueDrugFolder),","), by = .(improve_sample_id,common_name)] + + + +##do the drug matching +drug_df<-fread(drugfile)|> + dplyr::select('improve_drug_id','chem_name')|> + distinct() + +##update drug name PD901 since it's mussing +##now loop through manifest to get all the files +pdx_fold <- data.table(pdx)[,strsplit(as.character(PDX_Drug_Data),","), by = .(common_name)]|> + dplyr::rename(id='V1')|> + subset(!is.na(id)) + +pdx_meta<-do.call(rbind,lapply(pdx_fold$id, function(x) synapser::synGetAnnotations(x)|> + as.data.frame()|> + dplyr::select('experimentalCondition')|> + dplyr::mutate(id=x)))|>left_join(pdx_fold)|> + # tidyr::separate_rows(experimentalCondition,sep=';')|> + # mutate(chem_name=tolower(experimentalCondition))|> + # left_join(drug_df)|> + left_join(pdx_samps)|> + dplyr::select(improve_sample_id,id)|> + distinct()|> + subset(!is.na(id)) +pdx_meta$parentId=unlist(lapply(pdx_meta$id,function(x) synGet(x)$parentId)) + +##the older pdx data is in separate files. the newer is not. 
+#we need to reformat the older to look like the newer +oldfolders=c('syn22018363','syn22024460','syn22024428','syn22024429','syn22024437','syn22024438') + +old_meta<-subset(pdx_meta,parentId%in%oldfolders) + +old_data<-do.call(rbind,lapply(unique(old_meta$parentId),function(x){ + ids<-subset(old_meta,parentId==x)|> + subset(!is.na(id)) + + do.call(rbind,lapply(ids$id,function(y){ + tab<-readr::read_csv(synapser::synGet(y)$path) + print(head(tab)) + tab<-dplyr::select(tab,c('specimen_id','compound_name','dose','dose_unit', + 'experimental_time_point','experimental_time_point_unit', + 'assay_type','assay_value','assay_units'))|> + mutate(id=x)|> + mutate(chem_name=tolower(compound_name)) + + # tab$single_or_combo=sapply(tab$chem_name,function(z) ifelse(length(grep('\\+',z))>0,'combo','single')) + tab$chem_name=gsub('n/a','control',tab$chem_name)|> + tidyr::replace_na('control') + + tab$chem_name=sapply(tab$chem_name,function(z) ifelse(z=='doxorubinsin','doxorubicin',z)) + # tab<-tab|>left_join(drug_df) + #print(head(tab)) + return(tab) + })) +}))|> + left_join(unique(select(old_meta,id=parentId,improve_sample_id)))|> + dplyr::select(experiment=id,model_id=improve_sample_id,specimen_id,treatment=chem_name,time=experimental_time_point,time_unit=experimental_time_point_unit,volume=assay_value)|>distinct() + + + +new_meta<-subset(pdx_meta,!parentId%in%oldfolders) + +##now combine each of the old pdx files into single files +#each file has all experiments in it +new_data<-do.call(rbind,lapply(unique(new_meta$id), function(x){ + fpath=synapser::synGet(x)$path + if(length(grep('xls',fpath))>0){ + tab<-readxl::read_xlsx(fpath) + }else{ + tab<-readr::read_csv(fpath) + } + print(head(tab)) + tab<-dplyr::select(tab,c('specimen_id','compound_name','dose','dose_unit', + 'experimental_time_point','experimental_time_point_unit', + 'assay_type','assay_value','assay_units'))|> + mutate(id=x) + + # tab$single_or_combo=sapply(tab$compound_name,function(x) ifelse(length(grep('\\+',x))>0,'combo','single')) + tab$compound_name=gsub('N/A','control',tab$compound_name)|>tidyr::replace_na('control') + tab<-tab|> + mutate(chem_name=tolower(compound_name))#|> + # left_join(drug_df) + #print(head(tab)) + return(tab)}))|> + left_join(pdx_meta)|> + dplyr::select(experiment=id,model_id=improve_sample_id,specimen_id,treatment=chem_name,time=experimental_time_point,time_unit=experimental_time_point_unit,volume=assay_value)|>distinct() + +##maybe tweak the data frame a bit depending on curve fitting script +pdx_data<-rbind(old_data,new_data) + +#single_pdx<-subset(pdx_data,single_or_combo=='single') +#combo_pdx<-subset(pdx_data,single_or_combo=='combo') +#print(head(pdx_data)) +fwrite(pdx_data,'/tmp/curve_data.tsv',sep='\t') + +##TODO: create new curve fitting script in python +pycmd = '/opt/venv/bin/python calc_pdx_metrics.py --input /tmp/curve_data.tsv --outprefix /tmp/mpnstpdx' +print('running curve fitting') +#system(pycmd) + +##now read in data again, separate out by single/combo, then map to drug id + +##mmve file name +#file.rename('/tmp/experiments.0','/tmp/mpnstpdx_experiments.tsv') + + diff --git a/build/mpnstPDX/README.md b/build/mpnstPDX/README.md new file mode 100755 index 00000000..b0059283 --- /dev/null +++ b/build/mpnstPDX/README.md @@ -0,0 +1,47 @@ +## Build Instructions for MPNST PDX Dataset + +To build the MPNST PDX dataset, follow these steps from the coderdata root +directory. Currently using the test files as input. + +1. 
Build the Docker image:
+   ```
+   docker build -f build/docker/Dockerfile.mpnstpdx -t mpnstpdx . --build-arg HTTPS_PROXY=$HTTPS_PROXY
+   ```
+
+2. Generate new identifiers for these samples to create a
+   `mpnstpdx_samples.csv` file. This pulls from the latest synapse
+   project metadata table.
+   ```
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_samples.sh /tmp/build/build_test/test_samples.csv
+   ```
+
+3. Pull the data and map it to the samples. This uses the metadata
+   table pulled above.
+   ```
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_omics.sh /tmp/build/build_test/test_genes.csv /tmp/mpnstpdx_samples.csv
+   ```
+
+4. Process drug data.
+   ```
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_drugs.sh /tmp/build/build_test/test_drugs.tsv
+   ```
+
+5. Process experiment data. This uses the metadata from above as well as the file directory on synapse:
+   ```
+   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_exp.sh /tmp/mpnstpdx_samples.csv /tmp/mpnstpdx_drugs.tsv
+   ```
+
+Please ensure that each step is followed in order for correct dataset compilation.
+
+## MPNST PDX Dataset Structure
+The MPNST PDX dataset includes the following output files:
+```
+├── mpnstpdx_samples.csv
+├── mpnstpdx_transcriptomics.csv.gz
+├── mpnstpdx_mutations.csv.gz
+├── mpnstpdx_copy_number.csv.gz
+├── mpnstpdx_drugs.tsv
+├── mpnstpdx_drug_descriptors.tsv.gz
+├── mpnstpdx_experiments.tsv.gz
+```
 
diff --git a/build/mpnstPDX/build_drugs.sh b/build/mpnstPDX/build_drugs.sh
new file mode 100644
index 00000000..78502bc7
--- /dev/null
+++ b/build/mpnstPDX/build_drugs.sh
@@ -0,0 +1,4 @@
+##get drug data
+Rscript 02_get_drug_data.R $SYNAPSE_AUTH_TOKEN $1 /tmp/mpnstpdx_drugs.tsv
+##get drug descriptors
+/opt/venv/bin/python3 build_drug_desc.py --drugtable /tmp/mpnstpdx_drugs.tsv --desctable /tmp/mpnstpdx_drug_descriptors.tsv.gz
\ No newline at end of file
diff --git a/build/mpnstPDX/build_exp.sh b/build/mpnstPDX/build_exp.sh
new file mode 100644
index 00000000..4e34f6b3
--- /dev/null
+++ b/build/mpnstPDX/build_exp.sh
@@ -0,0 +1,2 @@
+Rscript 03_get_drug_response_data.R $SYNAPSE_AUTH_TOKEN $1 $2
+/opt/venv/bin/python3 calc_pdx_metrics.py /tmp/curve_data.tsv --drugfile=/tmp/mpnstpdx_drugs.tsv --outprefix=/tmp/mpnstpdx
diff --git a/build/mpnstPDX/build_omics.sh b/build/mpnstPDX/build_omics.sh
new file mode 100644
index 00000000..969b4fba
--- /dev/null
+++ b/build/mpnstPDX/build_omics.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running 01_mpnstpdx_get_omics.R with $SYNAPSE_AUTH_TOKEN, $2, and $1."
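+# 01_mpnstpdx_get_omics.R reads its arguments as <synapse token> <samples csv> <genes csv>, so $2 (samples) and $1 (genes) are passed in that order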
+Rscript 01_mpnstpdx_get_omics.R $SYNAPSE_AUTH_TOKEN $2 $1 diff --git a/build/mpnstPDX/build_samples.sh b/build/mpnstPDX/build_samples.sh new file mode 100644 index 00000000..aa88aa02 --- /dev/null +++ b/build/mpnstPDX/build_samples.sh @@ -0,0 +1 @@ +cp /tmp/mpnst_samples.csv /tmp/mpnstpdx_samples.csv diff --git a/build/mpnstPDX/requirements.r b/build/mpnstPDX/requirements.r new file mode 100755 index 00000000..e6139cd4 --- /dev/null +++ b/build/mpnstPDX/requirements.r @@ -0,0 +1,13 @@ +install.packages('reticulate', repos='https://cloud.r-project.org') +reticulate::use_virtualenv('/opt/venv', required = TRUE) +install.packages('remotes') +remotes::install_version('rjson', version = '0.2.21', repos = 'https://cloud.r-project.org') +install.packages('synapser', repos = c('http://ran.synapse.org', 'https://cloud.r-project.org')) +install.packages("dplyr") +install.packages("data.table") +install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org")) +install.packages("R.utils") +install.packages("stringr") +install.packages("tidyr") +install.packages('readr') +install.packages("readxl") diff --git a/build/mpnstPDX/requirements.txt b/build/mpnstPDX/requirements.txt new file mode 100755 index 00000000..b0944928 --- /dev/null +++ b/build/mpnstPDX/requirements.txt @@ -0,0 +1,12 @@ +pyarrow +pandas +matplotlib +numpy==1.26.4 +argparse +tqdm +scikit-learn +scipy +requests +mordredcommunity +rdkit +statsmodels diff --git a/build/utils/build_drug_desc.py b/build/utils/build_drug_desc.py index 7e25212e..f119bc77 100644 --- a/build/utils/build_drug_desc.py +++ b/build/utils/build_drug_desc.py @@ -82,8 +82,9 @@ def main(): cansmiles = [a for a in set(tab.canSMILES) if str(a)!='nan'] # isosmiles = list(set(tab.isoSMILES)) morgs = smiles_to_fingerprint(cansmiles) - +# print(morgs) ids = pd.DataFrame(tab[['improve_drug_id','canSMILES']]).drop_duplicates() +# print(ids) id_morg = ids.rename({"canSMILES":'smile'},axis=1).merge(morgs)[['improve_drug_id','structural_descriptor','descriptor_value']] mords = smiles_to_mordred(cansmiles,nproc=ncors) diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py new file mode 100755 index 00000000..c1a32d26 --- /dev/null +++ b/build/utils/calc_pdx_metrics.py @@ -0,0 +1,400 @@ +''' +python script designed to compute pdx curve metrics from file +''' + + +from argparse import * +import pandas as pd +from tqdm import tqdm +import numpy as np + +import statsmodels.api as sm +from statsmodels.formula.api import mixedlm + + +####mRECIST code +def tumor_volume_change(volume): + """ + Calculate the percent change in tumor volume relative to the initial volume. + + Parameters: + volume (np.ndarray or list): Array of tumor volumes. + + Returns: + np.ndarray: Array of percent changes in tumor volume. + """ + Vini = volume[0] + return np.array([100 * (vt - Vini) / Vini for vt in volume]) + +def avg_response(volume_change): + """ + Calculate the average response for each time point. + + Parameters: + volume_change (np.ndarray or list): Array of volume percent changes. + + Returns: + np.ndarray: Array of average responses. + """ + cumsum = np.cumsum(volume_change) + ar = cumsum / np.arange(1, len(volume_change) + 1) + return ar + +def check_numeric_int_char_zero(u): + """ + Check if the input array is empty and return NA if true. + + Parameters: + u (any): Input value or array. + + Returns: + The input value or None if it is empty. 
+ """ + return None if len(u) == 0 else u + +def get_best_response(time, response, min_time=None): + """ + Determine the best response after a specified minimum time. + + Parameters: + time (np.ndarray or list): Array of time points. + response (np.ndarray or list): Array of response values. + min_time (float or None): Minimum time after which tumor volume will be considered. + + Returns: + dict: Dictionary containing the best response time, value, and index. + """ + rtz = {"time": None, "value": None, "index": None} + + exdf = pd.DataFrame({"time": time, "response": response}) + exdf = exdf.dropna() + exdf.reset_index() + if min_time is not None: + exdf_a = exdf[exdf['time'] >= min_time] + + if exdf.shape[0] == 0: + return rtz + exdf_a.reset_index() + # print(exdf.shape) + min_indx = list(exdf.response).index(min(exdf_a.response)) + + rtz['time'] = check_numeric_int_char_zero([list(exdf.time)[min_indx]]) + rtz['value'] = check_numeric_int_char_zero([list(exdf.response)[min_indx]]) + rtz['index'] = check_numeric_int_char_zero([min_indx]) + return rtz + +def mrecist(time, volume, min_time=10, return_detail=False): + """ + Compute the mRECIST for given volume response. + + Parameters: + time (np.ndarray or list): Array of time points. + volume (np.ndarray or list): Array of tumor volumes. + min_time (float): Minimum time after which tumor volume will be considered. + return_detail (bool): If True, return all intermediate values. + + Returns: + str or dict: mRECIST classification or a dictionary with detailed intermediate values. + """ + if volume[0] == 0: + volume = np.array(volume) + 1 + + exdf = { + "volume_change": None, + "average_response": None, + "best_response": None, + "best_response)time": None, + "best_average_response": None, + "best_average_response_time": None, + "mRECIST": None + } + + exdf["volume_change"] = tumor_volume_change(volume) + exdf["average_response"] = avg_response(exdf["volume_change"]) + nxdf = {'metric':'mRESCIST','value':None} + + df = pd.DataFrame({"time": time, "volume": volume}) + df = df[df['time'] >= min_time] + + if df.shape[0] < 2: + print(f"Warning: insufficient data after time {min_time}") + else: + br = get_best_response(time, exdf["volume_change"], min_time) + exdf["best_response"] = br["value"] + exdf["best_response_time"] = br["time"] + + bar = get_best_response(time, exdf["average_response"], min_time) + exdf["best_average_response"] = bar["value"] + exdf["best_average_response_time"] = bar["time"] + + best_response = exdf["best_response"][0] + best_average_response = exdf["best_average_response"][0] + + mrecist = None + nxdf = {'metric':'mRESCIST','value':mrecist} +# exdf["mRECIST"] = mrecist + + if best_response is not None and best_average_response is not None: + # Order of mRECIST assignment matters + mrecist = "PD" + + if best_response < 35 and best_average_response < 30: + mrecist = "SD" + + if best_response < -50 and best_average_response < -20: + mrecist = "PR" + + if best_response < -95 and best_average_response < -40: + mrecist = "CR" + +# exdf["mRECIST"] = mrecist + nxdf = {'metric':'mRESCIST','value':mrecist} + if not return_detail: + return nxdf + + return nxdf + +###AUC CODE + +def trapz_auc(x, y): + """ + Compute the area under the curve using trapezoidal rule. + + Parameters: + x (np.ndarray): Array of x values. + y (np.ndarray): Array of y values. + + Returns: + float: Area under the curve. 
+ """ + n = np.arange(1, len(x)) + auc = np.dot((x[n] - x[n - 1]), (y[n] + y[n - 1])) / 2 + return auc + +def AUC(time, volume, time_normalize=True): + """ + Calculate the area under the curve (AUC). + + Parameters: + time (np.ndarray): Array of time points recorded for the experiment. + volume (np.ndarray): Array of volumes corresponding to the time points. + time_normalize (bool): If True, AUC value will be divided by max time. + + Returns: + dict: Dictionary containing the AUC value. + """ + auc = trapz_auc(time, volume) + #print(time) + if time_normalize: + auc = auc/np.max(time) + return {"metric": "auc", "value": auc, 'time':np.max(time)} + +def TGI(contr_volume, treat_volume,time): + """ + Computes the tumor growth inhibition (TGI) between two time-volume curves. + + Parameters: + contr_volume (np.ndarray or list): Volume vector for control. + treat_volume (np.ndarray or list): Volume vector for treatment. + + Returns: + dict: Dictionary containing the TGI value. + """ + tgi = None + if len(contr_volume) > 0 and len(treat_volume) > 0: + tgi = (contr_volume[-1] - treat_volume[-1]) / (contr_volume[0] - treat_volume[0]) + + # Simulated batch response class object + rtx = { + "metric": "TGI", + "value": tgi, + 'time': np.max(time) + } + return rtx + + +def ABC(contr_time=None, contr_volume=None, treat_time=None, treat_volume=None): + """ + Compute the area between two time-volume curves. + + Parameters: + contr_time (np.ndarray): Array of time points for control. + contr_volume (np.ndarray): Array of control volumes. + treat_time (np.ndarray): Array of time points for treatment. + treat_volume (np.ndarray): Array of treatment volumes. + + Returns: + dict: Dictionary containing the area between curves. + """ + con = {'name': 'auc', 'value': None} + tre = {'name': 'auc', 'value': None} + abc = None + + if contr_time is not None and contr_volume is not None: + if len(contr_time) != len(contr_volume): + raise ValueError("contr.time and contr.volume should have same length") + con = AUC(contr_time, contr_volume) + + if treat_time is not None and treat_volume is not None: + if len(treat_time) != len(treat_volume): + raise ValueError("treat.time and treat.volume should have same length") + tre = AUC(treat_time, treat_volume) + + abc = con['value'] - tre['value'] + return {"metric": "abc", "value": abc,'time':np.max(treat_time)}#, "control": con, "treatment": tre} + + +###LMM CODE +def lmm(time, volume, treatment, drug_name): + """ + Compute the linear mixed model (lmm) statistics for a PDX batch. + + Parameters: + data (pd.DataFrame): A DataFrame containing the batch data. Must contain the columns: model_id, volume, time, exp_type. + + Returns: + dict: A dictionary containing the fit object and the specific coefficient value. 
+ """ + + data = pd.DataFrame({'model_id':['model']*len(time),\ + 'volume':volume,\ + 'time':time,\ + 'exp_type':treatment}) + + data = data.dropna() + + ##create data frame from these 4 vectors + required_columns = ["model_id", "volume", "time", "exp_type"] + + if not all(column in data.columns for column in required_columns): + raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'") + + data['log_volume'] = np.log(data['volume']) + + # Define the formula for mixed linear model + formula = 'log_volume ~ time*exp_type' + + # Fit the model + model = mixedlm(formula, data, groups=data['model_id']) + fit = model.fit() + + # Get the coefficient for the interaction term 'time:exp_type' + #interaction_term = 'time:exp_type' +# if interaction_term in fit.params: +# time_coef_value = fit.params['time'] + #print(fit.params) + i_coef_value = fit.params['time:exp_type[T.'+drug_name+']'] + # else: + # coef_value = None # Handle the case when the interaction term is not present + + # Create a dictionary to store the fit object and the specific coefficient value + result = { + 'fit': fit, + 'interaction_coef_value': i_coef_value + } + + return {'metric': 'lmm','value': i_coef_value,'time':np.max(time)} + +def main(): + parser=ArgumentParser() + ###read in file with model id, volume, time, condition + parser.add_argument('curvefile') + parser.add_argument('--drugfile') + parser.add_argument('--outprefix',default='/tmp/') + + args = parser.parse_args() + + tab = pd.read_csv(args.curvefile,sep='\t') + drugs = pd.read_csv(args.drugfile,sep='\t') + + singles, combos = get_drug_stats(tab) + + ##join with drug ids + expsing = singles.rename({'drug':'chem_name','metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).merge(drugs,on='chem_name',how='left')[['improve_drug_id','improve_sample_id','drug_combination_metric','drug_combination_value']] + expsing = expsing.dropna() + + combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True) + combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() + + expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','drug_combination_metric','drug_combination_value']] + expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','drug_combination_metric','drug_combination_value']] + + expcomb[['source']]='Synapse' + expcomb[['study']]='MPNST PDX in vivo' + + expsing[['source']]='Synapse' + expsing[['study']]='MPNST PDX in vivo' + expsing.to_csv(args.outprefix+'_experiments.csv',index=False) + expcomb.to_csv(args.outprefix+'_combinations.csv',index=False) + + + +def get_drug_stats(df,control='control'): + ##for each experiment, call group + cols = ['experiment','model_id'] + groups = df.groupby(cols) + singleres = [] + combores = [] + + for name,group in tqdm(groups): + #each group contains multiple treatments anda control + drugs = set(group.treatment)-set([control]) + print(name[0]) + print(drugs) + mod = list(set(group.model_id))[0] + # print(set(group.model_id)) + ctl_data = group[group.treatment==control] + ctl_time = np.array(ctl_data.time) + ctl_volume = np.array(ctl_data.volume) + + ctl_auc = AUC(ctl_time,ctl_volume) + for d in 
+def get_drug_stats(df, control='control'):
+    ## for each experiment/model group, compute metrics for every drug vs. the control arm
+    cols = ['experiment','model_id']
+    groups = df.groupby(cols)
+    singleres = []
+    combores = []
+
+    for name,group in tqdm(groups):
+        # each group contains multiple treatments and a control
+        drugs = set(group.treatment)-set([control])
+        print(name[0])
+        print(drugs)
+        mod = list(set(group.model_id))[0]
+        # print(set(group.model_id))
+        ctl_data = group[group.treatment==control]
+        ctl_time = np.array(ctl_data.time)
+        ctl_volume = np.array(ctl_data.volume)
+
+        ctl_auc = AUC(ctl_time,ctl_volume)
+        for d in drugs:
+            print(d)
+            d_data = group[group.treatment==d]
+            treat_time = np.array(d_data.time)
+            treat_volume = np.array(d_data.volume)
+
+            # get abc for group
+            treat_auc = AUC(treat_time,treat_volume)
+            treat_abc = ABC(ctl_time,ctl_volume,treat_time,treat_volume)
+            #print(f"AUC: {treat_auc}")
+            #print(f"ABC: {treat_abc}")
+            treat_abc.update({'sample':mod,'drug':d,'time_unit':'days'})
+            if '+' in d:
+                combores.append(treat_abc)
+            else:
+                singleres.append(treat_abc)
+            # lmm
+            comb = pd.concat([ctl_data,d_data])
+            lmm_res = lmm(comb.time, comb.volume, comb.treatment, d)
+            lmm_res.update({'sample':mod,'drug':d,'time_unit':'days'})
+            #print(f"LMM: {lmm_res}")
+            if '+' in d:
+                combores.append(lmm_res)
+            else:
+                singleres.append(lmm_res)
+
+            # get tgi for group
+            tg = TGI(ctl_volume,treat_volume,treat_time)
+            tg.update({'sample':mod,'drug':d,'time_unit':'days'})
+            #print(tg)
+            if '+' in d:
+                combores.append(tg)
+            else:
+                singleres.append(tg)
+
+
+            # get mRECIST for group
+            mr = mrecist(treat_time,treat_volume)
+            mr.update({'sample':mod,'drug':d,'time_unit':'days'})
+            if '+' in d:
+                combores.append(mr)
+            else:
+                singleres.append(mr)
+
+    sing = pd.DataFrame.from_records(singleres)
+    comb = pd.DataFrame.from_records(combores)
+    return sing, comb
+
+if __name__=='__main__':
+    main()
diff --git a/build/utils/pubchem_retrieval.py b/build/utils/pubchem_retrieval.py
index e6bdc836..1cf646af 100644
--- a/build/utils/pubchem_retrieval.py
+++ b/build/utils/pubchem_retrieval.py
@@ -54,12 +54,12 @@ def retrieve_drug_info(compound,ignore_chems,isname=True):
 
     if isname:
         urls = {
-            "properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound}/property/CanonicalSMILES,IsomericSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
+            "properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound}/property/CanonicalSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
             "synonyms": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound}/synonyms/JSON"
         }
     else:
         urls = {
-            "properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{compound}/property/CanonicalSMILES,IsomericSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
+            "properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{compound}/property/CanonicalSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
             "synonyms": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{compound}/synonyms/JSON"
         }
@@ -188,9 +188,9 @@ def update_dataframe_and_write_tsv(unique_names, output_filename="drugs.tsv",ign
     mode = 'a' if file_exists else 'w'
     with open(output_filename, mode) as f:
         if not file_exists:
-            f.write("improve_drug_id\tchem_name\tpubchem_id\tcanSMILES\tisoSMILES\tInChIKey\tformula\tweight\n")
+            f.write("improve_drug_id\tchem_name\tpubchem_id\tcanSMILES\tInChIKey\tformula\tweight\n")
         for entry in data:
-            f.write(f"{entry['improve_drug_id']}\t{entry['name']}\t{entry.get('CID', '')}\t{entry['CanonicalSMILES']}\t{entry.get('IsomericSMILES', '')}\t{entry['InChIKey']}\t{entry['MolecularFormula']}\t{entry['MolecularWeight']}\n")
+            f.write(f"{entry['improve_drug_id']}\t{entry['name']}\t{entry.get('CID', '')}\t{entry['CanonicalSMILES']}\t{entry['InChIKey']}\t{entry['MolecularFormula']}\t{entry['MolecularWeight']}\n")
 
     with open(ignore_chems,"a") as ig_f:
         for entry in data:
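With IsomericSMILES removed from both the PubChem property request and the TSV writer, anything that reads drugs.tsv should now expect seven columns (canSMILES but no isoSMILES). A quick check of the layout; the path is a placeholder:

    import pandas as pd

    drugs = pd.read_csv('/tmp/mpnstpdx_drugs.tsv', sep='\t')
    expected = ['improve_drug_id', 'chem_name', 'pubchem_id', 'canSMILES',
                'InChIKey', 'formula', 'weight']
    assert list(drugs.columns) == expected, drugs.columns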
diff --git a/schema/coderdata.yaml b/schema/coderdata.yaml
index 3f9dafde..ac393a35 100755
--- a/schema/coderdata.yaml
+++ b/schema/coderdata.yaml
@@ -188,13 +188,13 @@ classes:
       - source
       - study
       attributes:
-        drug_one_id:
+        improve_drug_1:
           description: improve_drug_id of first drug
-        drug_two_id:
+        improve_drug_2:
           description: improve_drug_id of second drug
-        combination_metric:
+        drug_combination_metric:
           description: metric calculated for synergy, or other metric of two drugs
-        combination_value:
+        drug_combination_value:
           description: value of metric for synergy or combination
 enums:
   ResponseMetric:
diff --git a/schema/expected_files.yaml b/schema/expected_files.yaml
index 6604b094..91f9770e 100644
--- a/schema/expected_files.yaml
+++ b/schema/expected_files.yaml
@@ -32,11 +32,29 @@ datasets:
       file: /tmp/mpnst_proteomics.csv
     - target_class: Mutations
       file: /tmp/mpnst_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/mpnst_copy_number.csv
     - target_class: Experiments
       file: /tmp/mpnst_experiments.tsv
     - target_class: Drug
       file: /tmp/mpnst_drugs.tsv
+  mpnstpdx:
+    - target_class: Sample
+      file: /tmp/mpnstpdx_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/mpnstpdx_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/mpnstpdx_proteomics.csv
+    - target_class: Mutations
+      file: /tmp/mpnstpdx_mutations.csv
+    - target_class: Copy Number
+      file: /tmp/mpnstpdx_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/mpnstpdx_experiments.tsv
+    - target_class: Drug
+      file: /tmp/mpnstpdx_drugs.tsv
+
   cptac:
     - target_class: Sample
       file: /tmp/cptac_samples.csv
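The new mpnstpdx block appears to mirror the mpnst entry file-for-file, with Copy Number added to both. A small sketch of how the manifest could be checked locally; it assumes datasets: is the top-level key (as the hunk context suggests) and that PyYAML is available:

    import os
    import yaml   # PyYAML

    with open('schema/expected_files.yaml') as fh:
        manifest = yaml.safe_load(fh)

    for entry in manifest['datasets']['mpnstpdx']:
        status = 'ok' if os.path.exists(entry['file']) else 'missing'
        print(f"{entry['target_class']:16} {entry['file']} [{status}]")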