From 9ddcb6f5fb384f07fde502bf936863d7858593b4 Mon Sep 17 00:00:00 2001
From: Jeremy
Date: Thu, 28 Aug 2025 09:47:00 -0700
Subject: [PATCH 1/2] Drop all duplicate rows across files that had duplicate rows in 2.2 build

---
 coderbuild/beatAML/GetBeatAML.py | 1 +
 coderbuild/broad_sanger/04-drug_dosage_and_curves.py | 2 +-
 coderbuild/colorectal/02-omics-colorectal.py | 1 +
 coderbuild/liver/02-omics-liver.py | 6 ++++--
 coderbuild/novartis/02-omics-novartis.py | 2 ++
 coderbuild/pancreatic/02a-getPancreaticDataFromSynapse.py | 1 +
 coderbuild/utils/pubchem_retrieval.py | 1 +
 7 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/coderbuild/beatAML/GetBeatAML.py b/coderbuild/beatAML/GetBeatAML.py
index 27544bc4..5cb6f048 100755
--- a/coderbuild/beatAML/GetBeatAML.py
+++ b/coderbuild/beatAML/GetBeatAML.py
@@ -124,6 +124,7 @@ def generate_samples_file(prev_samples_path):
     mapping = {labId: i for i, labId in enumerate(all_samples['other_id'].unique(), start=(int(maxval)+1))}
     all_samples['improve_sample_id'] = all_samples['other_id'].map(mapping)
     all_samples.insert(1, 'improve_sample_id', all_samples.pop('improve_sample_id'))
+    all_samples.drop_duplicates(inplace=True)
     all_samples.to_csv("/tmp/beataml_samples.csv", index=False)
     return all_samples
 
diff --git a/coderbuild/broad_sanger/04-drug_dosage_and_curves.py b/coderbuild/broad_sanger/04-drug_dosage_and_curves.py
index ff81f26a..3b6f7245 100755
--- a/coderbuild/broad_sanger/04-drug_dosage_and_curves.py
+++ b/coderbuild/broad_sanger/04-drug_dosage_and_curves.py
@@ -55,7 +55,7 @@
 for of in outfiles:
     final_file.append(pd.read_csv(of,sep='\t'))
 
-pd.concat(final_file).to_csv('/tmp/broad_sanger_experiments.tsv',index=False,sep='\t')
+pd.concat(final_file).drop_duplicates().to_csv('/tmp/broad_sanger_experiments.tsv',index=False,sep='\t')
 
 #os.system('cat *.0 > /tmp/broad_sanger_experiments.tsv')
 #os.system('gzip -f /tmp/experiments.tsv')
diff --git a/coderbuild/colorectal/02-omics-colorectal.py b/coderbuild/colorectal/02-omics-colorectal.py
index 141140f6..70e3c9e6 100644
--- a/coderbuild/colorectal/02-omics-colorectal.py
+++ b/coderbuild/colorectal/02-omics-colorectal.py
@@ -242,6 +242,7 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
     else:
         print("Starting transcriptomics data.")
         transcriptomics_df = map_transcriptomics(transciptomics_data = "/tmp/GSE65253_col_tum_org_merge.csv.gz", improve_id_data = "/tmp/colorectal_samples.csv", entrez_data = "/tmp/genes.csv")
+        transcriptomics_df.drop_duplicates(inplace=True)
         transcriptomics_df.to_csv("/tmp/colorectal_transcriptomics.csv", index=False)
 
 if args.mutations:
diff --git a/coderbuild/liver/02-omics-liver.py b/coderbuild/liver/02-omics-liver.py
index 1774fc0b..504946ce 100644
--- a/coderbuild/liver/02-omics-liver.py
+++ b/coderbuild/liver/02-omics-liver.py
@@ -362,6 +362,7 @@ def map_proteomics(proteomics_data, improve_id_data, entrez_data):
     else:
         print("Starting transcriptomics data.")
         transcriptomics_df = map_transcriptomics(transciptomics_data = "/tmp/raw_rnaseq_data.csv", improve_id_data = "/tmp/liver_samples.csv", entrez_data = "/tmp/genes.csv")
+        transcriptomics_df.drop_duplicates(inplace=True)
         transcriptomics_df.to_csv("/tmp/liver_transcriptomics.csv", index=False)
 
 if args.mutations:
@@ -385,8 +386,9 @@ def map_proteomics(proteomics_data, improve_id_data, entrez_data):
         exit()
     else:
         print("Starting copy number data.")
-        mutation_df = map_copy_number(copy_number_data = "/tmp/raw_copynum_data.csv", improve_id_data = "/tmp/liver_samples.csv", entrez_data = "/tmp/genes.csv")
-        mutation_df.to_csv("/tmp/liver_copy_number.csv", index=False)
+        copy_number_df = map_copy_number(copy_number_data = "/tmp/raw_copynum_data.csv", improve_id_data = "/tmp/liver_samples.csv", entrez_data = "/tmp/genes.csv")
+        copy_number_df.drop_duplicates(inplace=True)
+        copy_number_df.to_csv("/tmp/liver_copy_number.csv", index=False)
 
 if args.proteomics:
     if args.genes is None or args.genes=='':
diff --git a/coderbuild/novartis/02-omics-novartis.py b/coderbuild/novartis/02-omics-novartis.py
index 64c98dd6..c86b096c 100644
--- a/coderbuild/novartis/02-omics-novartis.py
+++ b/coderbuild/novartis/02-omics-novartis.py
@@ -295,6 +295,7 @@ def map_mutations_novPDX(mutation_data, improve_id_data, entrez_data):
     else:
         print("Starting transcriptomics data.")
         transcriptomics_df_final = map_transcriptomics_novPDX(transcriptomics_data = "/tmp/raw_rnaseq_data.csv", improve_id_data = "/tmp/novartis_samples.csv", entrez_data = "/tmp/genes.csv")
+        transcriptomics_df_final.drop_duplicates(inplace=True)
         transcriptomics_df_final.to_csv("/tmp/novartis_transcriptomics.csv", index=False)
 
 if args.mutations:
@@ -307,6 +308,7 @@ def map_mutations_novPDX(mutation_data, improve_id_data, entrez_data):
     else:
         print("Starting mutations data.")
         mutation_df_final = map_mutations_novPDX(mutation_data = "/tmp/raw_mutation_data.csv", improve_id_data = "/tmp/novartis_samples.csv", entrez_data = "/tmp/genes.csv")
+        mutation_df_final.drop_duplicates(inplace=True)
         mutation_df_final.to_csv("/tmp/novartis_mutations.csv", index=False)
 
 if args.copy_number:
diff --git a/coderbuild/pancreatic/02a-getPancreaticDataFromSynapse.py b/coderbuild/pancreatic/02a-getPancreaticDataFromSynapse.py
index 0d337196..06b2bfdc 100644
--- a/coderbuild/pancreatic/02a-getPancreaticDataFromSynapse.py
+++ b/coderbuild/pancreatic/02a-getPancreaticDataFromSynapse.py
@@ -165,6 +165,7 @@ def main():
             res = parseMutFile(path,sampid, genes)
             alldats.append(res)
     newmut = pd.concat(alldats)
+    newmut.drop_duplicates(inplace=True)
     newmut.to_csv("/tmp/pancreatic_mutations.csv.gz",compression='gzip',index=False)
     #pd.DataFrame(missingsamples).to_csv('missing.csv',index=False,quoting=None,header=False)
 if __name__=='__main__':
diff --git a/coderbuild/utils/pubchem_retrieval.py b/coderbuild/utils/pubchem_retrieval.py
index b700271f..a72c5c68 100644
--- a/coderbuild/utils/pubchem_retrieval.py
+++ b/coderbuild/utils/pubchem_retrieval.py
@@ -441,6 +441,7 @@ def update_dataframe_and_write_tsv(unique_names,
         final_df = pd.DataFrame(columns=combined.columns)
 
     # --- 10) write final filtered output ---
+    final_df.drop_duplicates(inplace=True)
     final_df.to_csv(output_filename, sep="\t", index=False)
 
     if os.path.exists(temp_file):

From 31d194b60c0981787b68941175d2016fe89a3704 Mon Sep 17 00:00:00 2001
From: Jeremy
Date: Thu, 28 Aug 2025 09:50:18 -0700
Subject: [PATCH 2/2] Added last files for deduplication

---
 coderbuild/broad_sanger/02-broadSangerOmics.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/coderbuild/broad_sanger/02-broadSangerOmics.R b/coderbuild/broad_sanger/02-broadSangerOmics.R
index 7a11ae1f..bfd5780d 100755
--- a/coderbuild/broad_sanger/02-broadSangerOmics.R
+++ b/coderbuild/broad_sanger/02-broadSangerOmics.R
@@ -668,9 +668,9 @@ main<-function(){
   lapply(alltypes,function(dt){
     print(dt)
-    temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
+    temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()|>dplyr::distinct()
     readr::write_csv(temps,file=paste0('/tmp/sanger_',dt,'.csv.gz'))
-    tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
+    tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()|>dplyr::distinct()
     readr::write_csv(tempd,file=paste0('/tmp/broad_',dt,'.csv.gz'))
     # readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
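
Reviewer note (not part of the patch): every hunk above applies the same dedup-before-write pattern, dropping fully duplicated rows immediately before the final write via pandas drop_duplicates() in the Python builders and dplyr::distinct() in the R pipeline. A minimal, self-contained Python sketch of that pattern follows; the toy frame and the output path /tmp/example_dedup.csv are made up for illustration only.

    import pandas as pd

    # Toy frame containing one exact duplicate row.
    df = pd.DataFrame({"improve_sample_id": [1, 1, 2],
                       "value": [0.5, 0.5, 0.7]})

    # Drop exact duplicate rows in place, then write, mirroring the
    # drop_duplicates(inplace=True) / to_csv(...) pairs added in the patch.
    df.drop_duplicates(inplace=True)
    df.to_csv("/tmp/example_dedup.csv", index=False)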