# Dataset Documentation for Coderdata
### Table of Contents:
* Publication of Origin
* Data Modalities Available
* Data Transformations
* Samples x Drugs combinations
  

## Publication of Origin
* BeatAML - (TODO: add publication reference)
* bladderpdo
* broad_sanger
* cptac
* crcpdo
* hcmi
* lincs
* liverpdo
* mpnst
* novartispdx
* pancpdo
* sarcpdo

In [51]:
from pathlib import Path
from copy import deepcopy

import coderdata as cd

import matplotlib.pyplot as plt
import math
import os
import glob

import pandas as pd
import numpy as np

In [52]:
# Dataset identifiers that the per-dataset loops in this notebook iterate over.
dataset_names = [
    'hcmi', 'beataml', 'mpnst', 'pancreatic', 'cptac',
    'sarcoma', 'colorectal', 'bladder', 'liver', 'novartis',
]
# Kept as a separate list (per the variable name, the broad_sanger studies);
# the loops below process these after `dataset_names`.
broad_sanger_datasets = [
    "ccle", "ctrpv2", "fimm", "gdscv1",
    "gdscv2", "gcsi", "prism", "nci60",
]
# Starts out empty; it is not populated by any cell visible in this notebook section.
all_datasets = dict()

### Reading in # of Samples and Drugs

In [53]:
# Print the number of distinct sample IDs found in each dataset's samples file.
for dataset in dataset_names + broad_sanger_datasets:
    sample_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_samples.*".format(dataset))
    if not sample_file:
        print(f"No sample file found for dataset: {dataset}")
        continue
    # Only the first glob match is read; .tsv is parsed as tab-separated,
    # any other extension as comma-separated.
    sample_path = sample_file[0]
    sample_delimiter = "\t" if sample_path.endswith(".tsv") else ","
    samples_df = pd.read_csv(sample_path, sep=sample_delimiter)
    print(samples_df['improve_sample_id'].nunique())

1711
1022
50
49
1139
36
61
134
62
386
502
846
52
984
806
569
478
83


In [56]:
# Print, for each dataset, the drugs file that was picked up and the number of
# distinct drug IDs it contains.
for dataset in dataset_names + broad_sanger_datasets:
    drug_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_drugs.*".format(dataset))
    if not drug_file:
        print(f"No drug file found for dataset: {dataset}")
        continue
    drug_path = drug_file[0]  # first glob match only
    print(drug_path)
    drug_delimiter = "\t" if drug_path.endswith(".tsv") else ","
    drugs_df = pd.read_csv(drug_path, sep=drug_delimiter)
    print(drugs_df['improve_drug_id'].nunique())

No drug file found for dataset: hcmi
/Users/imal967/pnnl/projects/coderdata/all_datasets/beataml_drugs.tsv
164
/Users/imal967/pnnl/projects/coderdata/all_datasets/mpnst_drugs.tsv
30
/Users/imal967/pnnl/projects/coderdata/all_datasets/pancreatic_drugs.tsv
25
No drug file found for dataset: cptac
/Users/imal967/pnnl/projects/coderdata/all_datasets/sarcoma_drugs.tsv
34
/Users/imal967/pnnl/projects/coderdata/all_datasets/colorectal_drugs.tsv
10
/Users/imal967/pnnl/projects/coderdata/all_datasets/bladder_drugs.tsv
50
/Users/imal967/pnnl/projects/coderdata/all_datasets/liver_drugs.tsv
76
/Users/imal967/pnnl/projects/coderdata/all_datasets/novartis_drugs.tsv
25
/Users/imal967/pnnl/projects/coderdata/all_datasets/ccle_drugs.tsv
24
/Users/imal967/pnnl/projects/coderdata/all_datasets/ctrpv2_drugs.tsv
459
/Users/imal967/pnnl/projects/coderdata/all_datasets/fimm_drugs.tsv
52
/Users/imal967/pnnl/projects/coderdata/all_datasets/gdscv1_drugs.tsv
294
/Users/imal967/pnnl/projects/coderdata/all_datasets

In [57]:
# Print the number of distinct (sample, drug) combinations in each dataset's
# experiments file.
for dataset in dataset_names + broad_sanger_datasets:
    exp_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_experiments.*".format(dataset))
    if not exp_file:
        print(f"No experimental data file found for dataset: {dataset}")
        continue
    exp_path = exp_file[0]  # first glob match only
    exp_delimiter = "\t" if exp_path.endswith(".tsv") else ","
    exp_df = pd.read_csv(exp_path, sep=exp_delimiter)
    sample_drug_columns = exp_df[['improve_sample_id','improve_drug_id']]
    print(len(sample_drug_columns.drop_duplicates()))

No experimental data file found for dataset: hcmi
31926
272
190
No experimental data file found for dataset: cptac
275
140
3300
4453
1766
11543
309401
2663
247753
115440
13398
638983
2960756


In [58]:
# now doing rnaseq x drugs
# For each dataset, count the unique (sample, drug) pairs in the experiments
# file, restricted to samples that also appear in that dataset's
# transcriptomics file.
for dataset in dataset_names + broad_sanger_datasets:
    exp_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_experiments.*".format(dataset))
    transcriptomics_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_transcriptomics.*".format(dataset))
    if not exp_file:
        # Checked first so the (potentially large) transcriptomics table is
        # not read for datasets that have no experiments to filter.
        print(f"No experimental data file found for dataset: {dataset}")
        continue
    if not transcriptomics_file:
        # Without this guard the filter below would silently reuse the sample
        # IDs left over from the previous dataset (or raise NameError when the
        # very first dataset lacks a transcriptomics file).
        print(f"No transcriptomics data file found for dataset: {dataset}")
        continue
    transcriptomics_df = pd.read_csv(transcriptomics_file[0], sep="\t" if transcriptomics_file[0].endswith(".tsv") else ",")
    all_transcriptomics_samples = transcriptomics_df['improve_sample_id'].unique() # unique samples with transcriptomics data
    exp_df = pd.read_csv(exp_file[0], sep="\t" if exp_file[0].endswith(".tsv") else ",")
    # now filter to only those samples with transcriptomics data
    exp_transcriptomics_df = exp_df[exp_df['improve_sample_id'].isin(all_transcriptomics_samples)]
    # De-duplicate once and reuse the result for both the preview and the count.
    unique_sample_drug_pairs = exp_transcriptomics_df[['improve_sample_id','improve_drug_id']].drop_duplicates()
    number_of_unique_sample_drug_pairs = unique_sample_drug_pairs.shape[0]
    print(unique_sample_drug_pairs)
    print(number_of_unique_sample_drug_pairs)

No experimental data file found for dataset: hcmi
       improve_sample_id improve_drug_id
0                   2771       SMI_11173
1                   2771       SMI_11676
2                   2771       SMI_14986
3                   2771       SMI_16677
4                   2771       SMI_17300
...                  ...             ...
25192               2780        SMI_5869
25193               2780        SMI_6359
25194               2780        SMI_7087
25195               2780        SMI_7916
25196               2780        SMI_9857

[4137 rows x 2 columns]
4137
      improve_sample_id improve_drug_id
0                  3824       SMI_13126
3                  3824       SMI_16729
6                  3824       SMI_17006
9                  3824       SMI_17091
11                 3824       SMI_17210
...                 ...             ...
3000               3816       SMI_13126
3004               3816        SMI_6182
3008               3816       SMI_56074
3012               3821     

In [59]:
# rnaseq x mutations x drugs
# For each dataset in `dataset_names`, count the unique (sample, drug) pairs
# whose sample appears in both the transcriptomics and the mutations file.
for dataset in dataset_names:
    mutation_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_mutations.*".format(dataset))
    exp_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_experiments.*".format(dataset))
    transcriptomics_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_transcriptomics.*".format(dataset))
    if not exp_file:
        # Checked first so no omics table is read when there is nothing to filter.
        print(f"No experimental data file found for dataset: {dataset}")
        continue
    missing_modalities = [
        label
        for label, matches in (("mutation", mutation_file), ("transcriptomics", transcriptomics_file))
        if not matches
    ]
    if missing_modalities:
        # Without this guard the filter below would reuse sample IDs left over
        # from a previous dataset (or raise NameError on the first dataset).
        print(f"No {' or '.join(missing_modalities)} data file found for dataset: {dataset}")
        continue
    # loading in mutation data
    mutations_df = pd.read_csv(mutation_file[0], sep="\t" if mutation_file[0].endswith(".tsv") else ",")
    all_mutation_samples = mutations_df['improve_sample_id'].unique() # unique samples with mutation data
    transcriptomics_df = pd.read_csv(transcriptomics_file[0], sep="\t" if transcriptomics_file[0].endswith(".tsv") else ",")
    all_transcriptomics_samples = transcriptomics_df['improve_sample_id'].unique() # unique samples with transcriptomics data
    exp_df = pd.read_csv(exp_file[0], sep="\t" if exp_file[0].endswith(".tsv") else ",")
    # now filter to only those samples with transcriptomics data and mutation data
    exp_transcriptomics_mutation_df = exp_df[exp_df['improve_sample_id'].isin(all_transcriptomics_samples) & exp_df['improve_sample_id'].isin(all_mutation_samples)]
    number_of_unique_sample_drug_pairs = exp_transcriptomics_mutation_df[['improve_sample_id','improve_drug_id']].drop_duplicates().shape[0]
    print(number_of_unique_sample_drug_pairs)

No experimental data file found for dataset: hcmi
3958
184
185
No experimental data file found for dataset: cptac
187
60
640
4453
1734


In [84]:
# adding to the dataframe
def _read_dataset_table(dataset, suffix):
    """Load the first file matching ``<dataset>_<suffix>.*`` in the data directory.

    Parameters
    ----------
    dataset : str
        Dataset name used as the file-name prefix.
    suffix : str
        File-name part after the underscore, e.g. ``"experiments"``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table (tab-separated when the file ends in ``.tsv``,
        comma-separated otherwise), or ``None`` when no file matches.
    """
    matches = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_{}.*".format(dataset, suffix))
    if not matches:
        return None
    return pd.read_csv(matches[0], sep="\t" if matches[0].endswith(".tsv") else ",")


def _unique_sample_ids(table):
    """Return the unique ``improve_sample_id`` values of ``table``, or ``None`` if ``table`` is ``None``."""
    if table is None:
        return None
    return table['improve_sample_id'].unique()


def _count_sample_drug_pairs(exp_df, *required_sample_ids):
    """Count unique (sample, drug) pairs whose sample is in every given ID collection.

    Parameters
    ----------
    exp_df : pandas.DataFrame or None
        Experiments table with ``improve_sample_id`` and ``improve_drug_id`` columns.
    *required_sample_ids : array-like or None
        Sample-ID collections; a pair is kept only if its sample is in all of them.

    Returns
    -------
    int or float
        The number of unique pairs, or ``np.nan`` when ``exp_df`` or any of the
        ID collections is ``None`` (i.e. the corresponding file was missing).
    """
    if exp_df is None or any(ids is None for ids in required_sample_ids):
        return np.nan
    keep = pd.Series(True, index=exp_df.index)
    for ids in required_sample_ids:
        keep &= exp_df['improve_sample_id'].isin(ids)
    return exp_df.loc[keep, ['improve_sample_id','improve_drug_id']].drop_duplicates().shape[0]


# One row per dataset: [dataset, all pairs, pairs with transcriptomics,
# with transcriptomics+mutations, with transcriptomics+copy number,
# with mutations+copy number]; np.nan where a required file is missing.
to_add = []
for dataset in dataset_names + broad_sanger_datasets:
    exp_df = _read_dataset_table(dataset, "experiments")
    if exp_df is None:
        # Every count needs the experiments table, so skip reading the omics files.
        to_add.append([dataset] + [np.nan] * 5)
        continue
    # Each modality is loaded from this dataset's own file on every iteration,
    # so no branch can pick up a frame left over from a previous dataset
    # (previously the mutation x copy-number count reused frames that were only
    # loaded when a transcriptomics file was also present).
    all_transcriptomics_samples = _unique_sample_ids(_read_dataset_table(dataset, "transcriptomics"))
    all_mutation_samples = _unique_sample_ids(_read_dataset_table(dataset, "mutations"))
    all_copy_number_samples = _unique_sample_ids(_read_dataset_table(dataset, "copy_number"))

    num_sample_drug_pairs = _count_sample_drug_pairs(exp_df)
    num_sample_drug_transcript_pairs = _count_sample_drug_pairs(exp_df, all_transcriptomics_samples)
    num_sample_drug_transcript_mutation_pairs = _count_sample_drug_pairs(exp_df, all_transcriptomics_samples, all_mutation_samples)
    num_sample_drug_transcript_copynum_pairs = _count_sample_drug_pairs(exp_df, all_transcriptomics_samples, all_copy_number_samples)
    num_sample_drug_mutation_copynum_pairs = _count_sample_drug_pairs(exp_df, all_mutation_samples, all_copy_number_samples)
    to_add.append([dataset, num_sample_drug_pairs, num_sample_drug_transcript_pairs, num_sample_drug_transcript_mutation_pairs, num_sample_drug_transcript_copynum_pairs, num_sample_drug_mutation_copynum_pairs])


[['hcmi', nan, nan, nan, nan, nan]]
[['hcmi', nan, nan, nan, nan, nan], ['beataml', 31926, 4137, 3958, nan, nan]]
[['hcmi', nan, nan, nan, nan, nan], ['beataml', 31926, 4137, 3958, nan, nan], ['mpnst', 272, 193, 184, 191, 184]]
[['hcmi', nan, nan, nan, nan, nan], ['beataml', 31926, 4137, 3958, nan, nan], ['mpnst', 272, 193, 184, 191, 184], ['pancreatic', 190, 190, 185, 185, 185]]
[['hcmi', nan, nan, nan, nan, nan], ['beataml', 31926, 4137, 3958, nan, nan], ['mpnst', 272, 193, 184, 191, 184], ['pancreatic', 190, 190, 185, 185, 185], ['cptac', nan, nan, nan, nan, nan]]
[['hcmi', nan, nan, nan, nan, nan], ['beataml', 31926, 4137, 3958, nan, nan], ['mpnst', 272, 193, 184, 191, 184], ['pancreatic', 190, 190, 185, 185, 185], ['cptac', nan, nan, nan, nan, nan], ['sarcoma', 275, 234, 187, nan, nan]]
[['hcmi', nan, nan, nan, nan, nan], ['beataml', 31926, 4137, 3958, nan, nan], ['mpnst', 272, 193, 184, 191, 184], ['pancreatic', 190, 190, 185, 185, 185], ['cptac', nan, nan, nan, nan, nan], ['sarc

In [85]:
# Inspect the collected rows: one list per dataset, holding the dataset name
# followed by the five sample-drug pair counts (nan where a count was not computed).
to_add

[['hcmi', nan, nan, nan, nan, nan],
 ['beataml', 31926, 4137, 3958, nan, nan],
 ['mpnst', 272, 193, 184, 191, 184],
 ['pancreatic', 190, 190, 185, 185, 185],
 ['cptac', nan, nan, nan, nan, nan],
 ['sarcoma', 275, 234, 187, nan, nan],
 ['colorectal', 140, 60, 60, 60, 140],
 ['bladder', 3300, 840, 640, 640, 3100],
 ['liver', 4453, 4453, 4453, 4453, 4453],
 ['novartis', 1766, 1734, 1734, 1723, 1723],
 ['ccle', 11543, 10887, 10792, 10887, 11118],
 ['ctrpv2', 309401, 300507, 295742, 299698, 300616],
 ['fimm', 2663, 2457, 2457, 2457, 2611],
 ['gdscv1', 247753, 245220, 241999, 241240, 242570],
 ['gdscv2', 115440, 114373, 112829, 112523, 113133],
 ['gcsi', 13398, 12506, 12338, 12506, 13112],
 ['prism', 638983, 632078, 630672, 632078, 636226],
 ['nci60', 2960756, 2329149, 2329132, 2329149, 2784474]]

In [86]:
# Turn the per-dataset rows into a table; column order mirrors the order of
# the values appended to each row of `to_add`.
summary_columns = [
    'dataset',
    'sample_drug_pairs',
    'sample_drug_transcript_pairs',
    'sample_drug_transcriptomics_mutation_pairs',
    'sample_drug_transcriptomics_copynumber_pairs',
    'sample_drug_mutation_copynumber_pairs',
]
datset_stats = pd.DataFrame(data=to_add, columns=summary_columns)


In [87]:
# Order the rows alphabetically by dataset name (original row labels are kept),
# then display the table.
datset_stats = datset_stats.sort_values(by='dataset')
datset_stats

Unnamed: 0,dataset,sample_drug_pairs,sample_drug_transcript_pairs,sample_drug_transcriptomics_mutation_pairs,sample_drug_transcriptomics_copynumber_pairs,sample_drug_mutation_copynumber_pairs
1,beataml,31926.0,4137.0,3958.0,,
7,bladder,3300.0,840.0,640.0,640.0,3100.0
10,ccle,11543.0,10887.0,10792.0,10887.0,11118.0
6,colorectal,140.0,60.0,60.0,60.0,140.0
4,cptac,,,,,
11,ctrpv2,309401.0,300507.0,295742.0,299698.0,300616.0
12,fimm,2663.0,2457.0,2457.0,2457.0,2611.0
15,gcsi,13398.0,12506.0,12338.0,12506.0,13112.0
13,gdscv1,247753.0,245220.0,241999.0,241240.0,242570.0
14,gdscv2,115440.0,114373.0,112829.0,112523.0,113133.0


In [88]:
# Save the summary table as CSV, leaving out the row index.
summary_csv_path = "../docs/source/_static/dataset_summary_statistics.csv"
datset_stats.to_csv(summary_csv_path, index=False)

## Drug Curve Metrics Collected
This section looks at the experiment results and shows how many drugs each dataset has for each curve metric (taken from the `dose_response_metric` column).

In [63]:
# For every dataset with an experiments file, count the distinct drugs
# reported under each dose-response metric.
curve_metric_columns = ['dataset','curve_metric','num_drugs']
curve_metric_frames = []
for dataset in dataset_names + broad_sanger_datasets:
    exp_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_experiments.*".format(dataset))
    if not exp_file:
        print(f"No experimental data file found for dataset: {dataset}")
        continue
    exp_df = pd.read_csv(exp_file[0], sep="\t" if exp_file[0].endswith(".tsv") else ",")
    exp_df = exp_df[['improve_drug_id','dose_response_metric']].drop_duplicates()
    curve_metric_counts = (
        exp_df.groupby('dose_response_metric')['improve_drug_id'].nunique().reset_index()
        .rename(columns={'dose_response_metric':'curve_metric','improve_drug_id':'num_drugs'})
        .assign(dataset=dataset)
    )
    curve_metric_frames.append(curve_metric_counts[curve_metric_columns])

# Concatenate once instead of growing a DataFrame inside the loop: this avoids
# repeated copying and avoids concatenating onto an empty, untyped frame.
if curve_metric_frames:
    curve_metric_stats = pd.concat(curve_metric_frames, ignore_index=True)
else:
    curve_metric_stats = pd.DataFrame(columns=curve_metric_columns)


No experimental data file found for dataset: hcmi
No experimental data file found for dataset: cptac


In [64]:
# Display the per-dataset, per-metric drug counts assembled in the previous cell.
curve_metric_stats

Unnamed: 0,dataset,curve_metric,num_drugs
0,beataml,aac,138
1,beataml,auc,138
2,beataml,dss,138
3,beataml,fit_auc,138
4,beataml,fit_ec50,138
...,...,...,...
144,nci60,fit_ec50se,55157
145,nci60,fit_einf,55157
146,nci60,fit_hs,55157
147,nci60,fit_ic50,55157


In [65]:
# Wide table: one row per dataset, one column per curve metric, "X" where the
# dataset reports that metric and "" where it does not.
presence_absence = (
    curve_metric_stats.assign(present="X")
      .pivot_table(index="dataset", columns="curve_metric", values="present", aggfunc="first", fill_value="")
      .reset_index()
)

# `datset_stats` as built in this notebook has no 'num_drugs' column, so
# merging on it raises KeyError after Restart & Run All. Count the unique drug
# IDs directly from each dataset's drugs file instead.
drug_count_rows = []
for dataset in presence_absence['dataset']:
    drug_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_drugs.*".format(dataset))
    if drug_file:
        drugs_df = pd.read_csv(drug_file[0], sep="\t" if drug_file[0].endswith(".tsv") else ",")
        drug_count_rows.append({'dataset': dataset, 'num_drugs': drugs_df['improve_drug_id'].nunique()})
drug_counts = pd.DataFrame(drug_count_rows, columns=['dataset','num_drugs'])

# Left merge keeps every dataset that has curve metrics, even if its drugs
# file is missing (num_drugs is then NaN).
presence_absence = presence_absence.merge(drug_counts, on='dataset', how='left')

# Metric columns in sorted order, with 'TGI' moved to the end; 'TGI' is only
# appended when it exists so the selection cannot fail on a missing column.
metric_columns = sorted(col for col in presence_absence.columns if col not in ['dataset','num_drugs','TGI'])
trailing_columns = ['TGI'] if 'TGI' in presence_absence.columns else []
sorted_columns = ['dataset','num_drugs'] + metric_columns + trailing_columns
presence_absence = presence_absence[sorted_columns]
presence_absence

Unnamed: 0,dataset,num_drugs,aac,abc,auc,dss,fit_auc,fit_ec50,fit_ec50se,fit_einf,fit_hs,fit_ic50,fit_r2,lmm,mRESCIST,published_auc,TGI
0,beataml,164,X,,X,X,X,X,X,X,X,X,X,,,,
1,bladder,50,X,,X,X,X,X,X,X,X,X,X,,,,
2,ccle,24,X,,X,X,X,X,X,X,X,X,X,,,,
3,colorectal,10,X,,X,X,X,X,X,X,X,X,X,,,,
4,ctrpv2,459,X,,X,X,X,X,X,X,X,X,X,,,,
5,fimm,52,X,,X,X,X,X,X,X,X,X,X,,,,
6,gcsi,44,X,,X,X,X,X,X,X,X,X,X,,,,
7,gdscv1,294,X,,X,X,X,X,X,X,X,X,X,,,,
8,gdscv2,171,X,,X,X,X,X,X,X,X,X,X,,,,
9,liver,76,X,,X,X,X,X,X,X,X,X,X,,,,


In [66]:
# Save the wide presence/absence table as CSV, leaving out the row index.
curve_metrics_csv_path = "../docs/source/_static/dataset_curve_metrics_wide.csv"
presence_absence.to_csv(curve_metrics_csv_path, index=False)

# Dataset Modalities Overview DF

In [76]:
# Empty frame that only carries the column layout of the modalities overview.
datset_modalities_df = pd.DataFrame(columns=["Dataset", "References", "Sample", "Transcriptomics", "Proteomics", "Mutations", "Copy Number", "Drug", "Drug Descriptor", "Experiments"])
list_of_rows = []

# For each dataset, print the sample count, the drug count and an "X" for
# every modality file (experiments, transcriptomics, mutations, copy number)
# that exists; a missing modality prints as an empty string.
for dataset in dataset_names + broad_sanger_datasets:
    sample_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_samples.*".format(dataset))
    drug_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_drugs.*".format(dataset))
    exp_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_experiments.*".format(dataset))
    transcriptomics_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_transcriptomics.*".format(dataset))
    mutation_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_mutations.*".format(dataset))
    copy_number_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_copy_number.*".format(dataset))

    # number of unique samples (nan when the samples file is absent)
    if not sample_file:
        print(f"Missing files for dataset: {dataset}")
        num_samples = np.nan
    else:
        samples_df = pd.read_csv(sample_file[0], sep="\t" if sample_file[0].endswith(".tsv") else ",")
        num_samples = samples_df['improve_sample_id'].nunique()

    # number of unique drugs (nan when the drugs file is absent)
    if not drug_file:
        print(f"Missing files for dataset: {dataset}")
        num_drugs = np.nan
    else:
        drugs_df = pd.read_csv(drug_file[0], sep="\t" if drug_file[0].endswith(".tsv") else ",")
        num_drugs = drugs_df['improve_drug_id'].nunique()

    # presence flags: "X" when at least one matching file was found
    exp_present = "X" if exp_file else ""
    transcriptomics_present = "X" if transcriptomics_file else ""
    mutation_present = "X" if mutation_file else ""
    copy_number_present = "X" if copy_number_file else ""

    print(dataset, num_samples, num_drugs, exp_present, transcriptomics_present, mutation_present, copy_number_present)
    # NOTE(review): `list_of_rows` and `datset_modalities_df` are never filled
    # in this cell; the per-dataset values are only printed. Presumably the
    # rows were meant to be collected here -- confirm the intended columns.



Missing files for dataset: hcmi
hcmi 1711 nan  X X X
beataml 1022 164 X X X 
mpnst 50 30 X X X X
pancreatic 49 25 X X X X
Missing files for dataset: cptac
cptac 1139 nan  X X X
sarcoma 36 34 X X X 
colorectal 61 10 X X X X
bladder 134 50 X X X X
liver 62 76 X X X X
novartis 386 25 X X X X
ccle 502 24 X X X X
ctrpv2 846 459 X X X X
fimm 52 52 X X X X
gdscv1 984 294 X X X X
gdscv2 806 171 X X X X
gcsi 569 44 X X X X
prism 478 1419 X X X X
nci60 83 55157 X X X X


In [68]:
# Show the modalities overview frame. This cell previously displayed
# `datset_stats`, which on a top-to-bottom run is the sample-drug pair summary
# rather than the modalities table this section builds.
datset_modalities_df

Unnamed: 0,Dataset,References,Sample,Transcriptomics,Proteomics,Mutations,Copy Number,Drug,Drug Descriptor,Experiments
