# Dataset Documentation for Coderdata
### Table of Contents:
* Publication of Origin
* Data Modalities Available
* Data Transformations
* Samples x Drugs combinations
  

## Publication of Origin
BeatAML -  
bladderpdo
broad_sanger
cptac
crcpdo
hcmi
lincs
liverpdo
mpnst
novartispdx
pancpdo
sarcpdo

In [2]:
from pathlib import Path
from copy import deepcopy

import coderdata as cd

import matplotlib.pyplot as plt
import math
import os
import glob

import pandas as pd
import numpy as np



In [3]:
# Load in all Datasets
dataset_names = ['hcmi','beataml','mpnst','pancpdo','cptac','sarcpdo','colorectal','bladderpdo','liver','novartis']
broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
all_datasets = {}

### Reading in # of Samples and Drugs

In [4]:
# samples first
for dataset in dataset_names + broad_sanger_datasets:
    sample_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_samples.*".format(dataset))
    if sample_file:
        samples_df = pd.read_csv(sample_file[0], sep="\t" if sample_file[0].endswith(".tsv") else ",")
        print(samples_df['improve_sample_id'].nunique())
    else:
        print(f"No sample file found for dataset: {dataset}")

886
1022
50
70
1139
36
61
134
62
386
502
846
52
984
806
569
478
83


In [None]:
# drugs next
for dataset in dataset_names + broad_sanger_datasets:
    drug_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_drugs.*".format(dataset))
    if drug_file:
        drugs_df = pd.read_csv(drug_file[0], sep="\t" if drug_file[0].endswith(".tsv") else ",")
        print(drugs_df['improve_drug_id'].nunique())
    else:
        print(f"No drug file found for dataset: {dataset}")

No drug file found for dataset: hcmi
164
25
25
No drug file found for dataset: cptac
34
10
50
76
25


In [None]:
# now getting unique sample-drug pairs in the experimental data
for dataset in dataset_names + broad_sanger_datasets:
    exp_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_experiments.*".format(dataset))
    if exp_file:
        exp_df = pd.read_csv(exp_file[0], sep="\t" if exp_file[0].endswith(".tsv") else ",")
        print(exp_df[['improve_sample_id','improve_drug_id']].drop_duplicates().shape[0])
    else:
        print(f"No experimental data file found for dataset: {dataset}")

No experimental data file found for dataset: hcmi
23662
212
290
No experimental data file found for dataset: cptac
276
140
3300
4453
1766


In [13]:
# now doing rnaseq x drugs
# going to need rnaseq data and experimental data
for dataset in dataset_names + broad_sanger_datasets:
    exp_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_experiments.*".format(dataset))
    transcriptomics_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_transcriptomics.*".format(dataset))
    if transcriptomics_file:
        transcriptomics_df = pd.read_csv(transcriptomics_file[0], sep="\t" if transcriptomics_file[0].endswith(".tsv") else ",")
        all_transcriptomics_samples = transcriptomics_df['improve_sample_id'].unique() # unique samples with transcriptomics data
    if exp_file:
        exp_df = pd.read_csv(exp_file[0], sep="\t" if exp_file[0].endswith(".tsv") else ",")
        # now filter to only those samples with transcriptomics data
        exp_transcriptomics_df = exp_df[exp_df['improve_sample_id'].isin(all_transcriptomics_samples)]
        number_of_unique_sample_drug_pairs = exp_transcriptomics_df[['improve_sample_id','improve_drug_id']].drop_duplicates().shape[0]
        print(exp_transcriptomics_df[['improve_sample_id','improve_drug_id']].drop_duplicates())
        print(number_of_unique_sample_drug_pairs)
    else:
        print(f"No experimental data file found for dataset: {dataset}")

No experimental data file found for dataset: hcmi
       improve_sample_id improve_drug_id
0                   3909        SMI_3871
1                   3909        SMI_4862
2                   3909       SMI_11493
3                   3909       SMI_23048
4                   3909       SMI_51801
...                  ...             ...
18657               3918       SMI_44535
18658               3918       SMI_40153
18659               3918       SMI_13100
18660               3918       SMI_16810
18661               3918       SMI_35928

[3033 rows x 2 columns]
3033
     improve_sample_id improve_drug_id
0                 5373       SMI_50659
2                 5373       SMI_51826
4                 5373       SMI_24544
7                 5373        SMI_5642
10                5373       SMI_13747
..                 ...             ...
259               5382       SMI_39432
260               5382       SMI_22995
261               5382       SMI_43543
262               5382       SMI_56600

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [38]:
# rnaseq x mutations x drugs 
# loading in mutation data
for dataset in dataset_names:
    mutation_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_mutations.*".format(dataset))
    if mutation_file:
        mutations_df = pd.read_csv(mutation_file[0], sep="\t" if mutation_file[0].endswith(".tsv") else ",")
        all_mutation_samples = mutations_df['improve_sample_id'].unique() # unique samples with mutation data
    exp_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_experiments.*".format(dataset))
    transcriptomics_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_transcriptomics.*".format(dataset))
    if transcriptomics_file:
        transcriptomics_df = pd.read_csv(transcriptomics_file[0], sep="\t" if transcriptomics_file[0].endswith(".tsv") else ",")
        all_transcriptomics_samples = transcriptomics_df['improve_sample_id'].unique() # unique samples with transcriptomics data
    if exp_file:  
        exp_df = pd.read_csv(exp_file[0], sep="\t" if exp_file[0].endswith(".tsv") else ",")
        # now filter to only those samples with transcriptomics data and mutation data
        exp_transcriptomics_mutation_df = exp_df[exp_df['improve_sample_id'].isin(all_transcriptomics_samples) & exp_df['improve_sample_id'].isin(all_mutation_samples)]
        number_of_unique_sample_drug_pairs = exp_transcriptomics_mutation_df[['improve_sample_id','improve_drug_id']].drop_duplicates().shape[0]
        print(number_of_unique_sample_drug_pairs)
    else:
        print(f"No experimental data file found for dataset: {dataset}")

No experimental data file found for dataset: hcmi
2905
163
175
No experimental data file found for dataset: cptac
187
60
640
4453
1734


In [5]:
# adding to the dataframe
datset_stats = pd.DataFrame(columns=['dataset','num_samples','num_drugs','num_sample_drug_pairs','num_sample_drug_transcript_pairs','num_sample_drug_transcript_mutation_pairs','num_sample_drug_transcript_copynum_pairs','num_sample_drug_mutation_copynum_pairs'])
for dataset in dataset_names + broad_sanger_datasets:
    sample_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_samples.*".format(dataset))
    drug_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_drugs.*".format(dataset))
    exp_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_experiments.*".format(dataset))
    transcriptomics_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_transcriptomics.*".format(dataset))
    mutation_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_mutations.*".format(dataset))
    copy_number_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_copy_number.*".format(dataset)) 
    # getting number of samples
    if sample_file:
        samples_df = pd.read_csv(sample_file[0], sep="\t" if sample_file[0].endswith(".tsv") else ",")
        num_samples = samples_df['improve_sample_id'].nunique() # unique samples
    else:
        print(f"Missing files for dataset: {dataset}")
        num_samples = np.nan
    # getting number of drugs
    if drug_file:
        drugs_df = pd.read_csv(drug_file[0], sep="\t" if drug_file[0].endswith(".tsv") else ",")
        num_drugs = drugs_df['improve_drug_id'].nunique() # unique drugs
    else:
        print(f"Missing files for dataset: {dataset}")
        num_drugs = np.nan
    # getting number of unique sample-drug pairs
    if exp_file:
        exp_df = pd.read_csv(exp_file[0], sep="\t" if exp_file[0].endswith(".tsv") else ",")
        num_sample_drug_pairs = exp_df[['improve_sample_id','improve_drug_id']].drop_duplicates().shape[0]
    else:
        num_sample_drug_pairs = np.nan
    # getting number of unique sample-drug pairs with transcriptomics data
    if transcriptomics_file and exp_file:
        transcriptomics_df = pd.read_csv(transcriptomics_file[0], sep="\t" if transcriptomics_file[0].endswith(".tsv") else ",")
        all_transcriptomics_samples = transcriptomics_df['improve_sample_id'].unique() # unique samples with transcriptomics data
        # now filter to only those samples with transcriptomics data
        exp_transcriptomics_df = exp_df[exp_df['improve_sample_id'].isin(all_transcriptomics_samples)]
        num_sample_drug_transcript_pairs = exp_transcriptomics_df[['improve_sample_id','improve_drug_id']].drop_duplicates().shape[0]
    else:
        num_sample_drug_transcript_pairs = np.nan
    # unique sample-drug pairs with mutation data and transcriptomics data
    if transcriptomics_file and mutation_file and exp_file:
        mutations_df = pd.read_csv(mutation_file[0], sep="\t" if mutation_file[0].endswith(".tsv") else ",")
        all_mutation_samples = mutations_df['improve_sample_id'].unique() # unique samples with mutation data
        all_transcriptomics_samples = transcriptomics_df['improve_sample_id'].unique() # unique samples with transcriptomics data
        # now filter to only those samples with transcriptomics data and mutation data
        exp_transcriptomics_mutation_df = exp_df[exp_df['improve_sample_id'].isin(all_transcriptomics_samples) & exp_df['improve_sample_id'].isin(all_mutation_samples)]
        num_sample_drug_transcript_mutation_pairs = exp_transcriptomics_mutation_df[['improve_sample_id','improve_drug_id']].drop_duplicates().shape[0]
    else:
        num_sample_drug_transcript_mutation_pairs = np.nan
    # unique sample-drug pairs with copynum data and transcriptomics data
    if transcriptomics_file and copy_number_file and exp_file:
        copy_number_df = pd.read_csv(copy_number_file[0], sep="\t" if copy_number_file[0].endswith(".tsv") else ",")
        all_copy_number_samples = copy_number_df['improve_sample_id'].unique() # unique samples with copy number data
        all_transcriptomics_samples = transcriptomics_df['improve_sample_id'].unique() # unique samples with transcriptomics data
        # now filter to only those samples with transcriptomics data and copy number data
        exp_transcriptomics_copy_number_df = exp_df[exp_df['improve_sample_id'].isin(all_transcriptomics_samples) & exp_df['improve_sample_id'].isin(all_copy_number_samples)]
        num_sample_drug_transcript_copynum_pairs = exp_transcriptomics_copy_number_df[['improve_sample_id','improve_drug_id']].drop_duplicates().shape[0]
    else:
        num_sample_drug_transcript_copynum_pairs = np.nan
    # unique sample-drug pairs with mutation data and copynum data
    if mutation_file and copy_number_file and exp_file:
        all_mutation_samples = mutations_df['improve_sample_id'].unique()
        all_copy_number_samples = copy_number_df['improve_sample_id'].unique()
        exp_mutation_copy_number_df = exp_df[exp_df['improve_sample_id'].isin(all_mutation_samples) & exp_df['improve_sample_id'].isin(all_copy_number_samples)]
        num_sample_drug_mutation_copynum_pairs = exp_mutation_copy_number_df[['improve_sample_id','improve_drug_id']].drop_duplicates().shape[0]
    else:
        num_sample_drug_mutation_copynum_pairs = np.nan
    # adding to dataframe
    datset_stats = pd.concat([datset_stats, pd.DataFrame({'dataset':[dataset],'num_samples':[num_samples],'num_drugs':[num_drugs],'num_sample_drug_pairs':[num_sample_drug_pairs],'num_sample_drug_transcript_pairs':num_sample_drug_transcript_pairs,'num_sample_drug_transcript_mutation_pairs':num_sample_drug_transcript_mutation_pairs,'num_sample_drug_transcript_copynum_pairs':num_sample_drug_transcript_copynum_pairs,'num_sample_drug_mutation_copynum_pairs':num_sample_drug_mutation_copynum_pairs})], ignore_index=True)


Missing files for dataset: hcmi
Missing files for dataset: cptac


In [6]:
datset_stats

Unnamed: 0,dataset,num_samples,num_drugs,num_sample_drug_pairs,num_sample_drug_transcript_pairs,num_sample_drug_transcript_mutation_pairs,num_sample_drug_transcript_copynum_pairs,num_sample_drug_mutation_copynum_pairs
0,hcmi,886,,,,,,
1,beataml,1022,164.0,23662.0,3033.0,2905.0,,
2,mpnst,50,25.0,212.0,163.0,163.0,163.0,163.0
3,pancpdo,70,25.0,290.0,180.0,175.0,175.0,285.0
4,cptac,1139,,,,,,
5,sarcpdo,36,34.0,276.0,234.0,187.0,,
6,colorectal,61,10.0,140.0,60.0,60.0,60.0,140.0
7,bladderpdo,134,50.0,3300.0,840.0,640.0,640.0,3100.0
8,liver,62,76.0,4453.0,4453.0,4453.0,4453.0,4453.0
9,novartis,386,25.0,1766.0,1734.0,1734.0,1723.0,1723.0


In [7]:
datset_stats.to_csv("/Users/imal967/pnnl/projects/coderdata/dataset_summary_statistics.csv", index=False)

## Drug Curve Metrics Collected
This looks at the experiments results and shows how many drugs we have for each curve metric (found in the dose_response_metric column).

In [10]:
curve_metric_stats = pd.DataFrame(columns=['dataset','curve_metric','num_drugs'])
for dataset in dataset_names + broad_sanger_datasets:
    exp_file = glob.glob("/Users/imal967/pnnl/projects/coderdata/all_datasets/{}_experiments.*".format(dataset))
    if exp_file:
        exp_df = pd.read_csv(exp_file[0], sep="\t" if exp_file[0].endswith(".tsv") else ",")
        exp_df = exp_df[['improve_drug_id','dose_response_metric']].drop_duplicates()
        curve_metric_counts = exp_df.groupby('dose_response_metric')['improve_drug_id'].nunique().reset_index()
        curve_metric_counts['dataset'] = dataset
        curve_metric_stats = pd.concat([curve_metric_stats, curve_metric_counts.rename(columns={'dose_response_metric':'curve_metric','improve_drug_id':'num_drugs'})], ignore_index=True)
    else:
        print(f"No experimental data file found for dataset: {dataset}")


No experimental data file found for dataset: hcmi
No experimental data file found for dataset: cptac


In [11]:
curve_metric_stats

Unnamed: 0,dataset,curve_metric,num_drugs
0,beataml,aac,108
1,beataml,auc,108
2,beataml,dss,108
3,beataml,fit_auc,108
4,beataml,fit_ec50,108
...,...,...,...
140,nci60,fit_ec50se,54654
141,nci60,fit_einf,54654
142,nci60,fit_hs,54654
143,nci60,fit_ic50,54654


In [12]:
curve_metric_stats.to_csv("/Users/imal967/pnnl/projects/coderdata/dataset_curve_metric_summary.csv", index=False)