In [1]:
import pandas as pd
from druxai.utils.data import DrugResponseDataset

In [2]:
df = pd.read_csv("/Users/niklaskiermeyer/Desktop/Codespace/DruxAI/data/raw/sanger_files/sanger-dose-response.csv")
data_prism = DrugResponseDataset("/Users/niklaskiermeyer/Desktop/Codespace/DruxAI/data/preprocessed")

[34mINFO    [0m Loaded targets with shape: [1m([0m[1;36m556840[0m, [1;36m8[0m[1m)[0m                                                                    
[34mINFO    [0m Loaded molecular data with shape: [1m([0m[1;36m1479[0m, [1;36m19193[0m[1m)[0m                                                           


In [3]:
df = df[~df["auc"].isna()].reset_index(drop=True)
df

Unnamed: 0,DATASET,COSMIC_ID,DRUG_ID,MIN_CONC,MAX_CONC,RMSE_PUBLISHED,Z_SCORE_PUBLISHED,IC50_PUBLISHED,AUC_PUBLISHED,ARXSPAN_ID,DRUG_NAME,BROAD_ID,upper_limit,ec50,slope,lower_limit,auc,log2.ic50,mse,R2
0,GDSC1,683665,1,0.007813,2.0,0.022518,-0.192056,10.977393,0.982116,ACH-002270,ERLOTINIB,BRD-K70401845,0.992788,2.839376e+00,-5.670993,0.514389,0.990834,,0.000034,0.904675
1,GDSC1,684055,1,0.007813,2.0,0.031831,0.505823,23.133991,0.984820,ACH-002104,ERLOTINIB,BRD-K70401845,1.006405,2.864875e-02,-0.186377,0.990054,0.997138,,0.000057,0.028903
2,GDSC1,684062,1,0.007813,2.0,0.087010,-0.114395,11.926884,0.944463,ACH-002111,ERLOTINIB,BRD-K70401845,0.989580,7.580375e-02,-12.222777,0.894027,0.933185,,0.000623,0.777093
3,GDSC1,684072,1,0.007813,2.0,0.016288,-0.530674,7.645605,0.950763,ACH-000087,ERLOTINIB,BRD-K70401845,0.998887,9.671393e-01,-2.496776,0.816721,0.973032,,0.000018,0.993758
4,GDSC1,687799,1,0.007813,2.0,0.043985,-5.365666,0.043694,0.349947,ACH-000766,ERLOTINIB,BRD-K70401845,0.939255,9.351034e-02,-0.698966,0.047188,0.458337,-3.481481,0.000693,0.987013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327335,GDSC2,905951,2172,0.009766,10.0,0.143863,0.042524,25.410793,0.877741,ACH-000288,JQ1,"BRD-K54606188, NA",3.929045,3.287745e+11,-0.012190,-3.449953,0.856099,,0.006178,0.292447
327336,GDSC2,906862,2172,0.009766,10.0,0.088631,-2.223819,0.339325,0.510590,ACH-001065,JQ1,"BRD-K54606188, NA",0.935866,4.096968e-01,-2.014115,0.176515,0.585800,-1.073816,0.003198,0.971991
327337,GDSC2,907046,2172,0.009766,10.0,0.114748,-0.578942,7.780877,0.843211,ACH-000930,JQ1,"BRD-K54606188, NA",0.960799,3.695946e-01,-1.161533,0.687792,0.830671,,0.002792,0.795935
327338,GDSC2,749709,2172,0.009766,10.0,0.047625,1.642265,534.688321,0.983634,ACH-000859,JQ1,"BRD-K54606188, NA",0.977260,1.161541e+30,-0.031889,0.900234,0.969870,,0.000879,0.000540


In [4]:
selected_columns = df[["DATASET", "DRUG_ID", "IC50_PUBLISHED", "AUC_PUBLISHED", "ARXSPAN_ID", "DRUG_NAME"]]

In [5]:
# Find NAs of specific columns
selected_columns.isna().sum()

DATASET              0
DRUG_ID              0
IC50_PUBLISHED       0
AUC_PUBLISHED        0
ARXSPAN_ID        7227
DRUG_NAME            0
dtype: int64

In [6]:
# Drop rows with NAs
sanger_dataset = selected_columns.dropna().copy()

# Create an auc per drug column
sanger_dataset["auc_per_drug"] = sanger_dataset.groupby("DRUG_ID")["AUC_PUBLISHED"].transform(
    lambda x: (x - x.mean()) / x.std())

sanger_dataset = sanger_dataset.rename(columns={"DATASET": "Dataset", "DRUG_ID": "Drug_ID", "IC50_PUBLISHED": "IC50",
                                                "AUC_PUBLISHED": "auc", "ARXSPAN_ID": "cell_line", "DRUG_NAME": "DRUG"})

sanger_dataset = sanger_dataset.reindex(columns=["cell_line", "DRUG", "auc", "IC50", "auc_per_drug", "Drug_ID",
                                                 "Dataset"])

In [7]:
print(f"Matching cell lines in prism and sanger data: {len(set(sanger_dataset['cell_line']) &
      set(data_prism.targets['cell_line']))}")
print(f"There are a total of {sanger_dataset['cell_line'].nunique()} cell lines in the sanger dataset.")

Matching cell lines in prism and sanger data: 343
There are a total of 972 cell lines in the sanger dataset.


In [8]:
# Remove cell lines which are already in the prism dataset
sanger_dataset = sanger_dataset[~sanger_dataset["cell_line"].isin(data_prism.targets["cell_line"])]

In [22]:
# Check whether all cell lines have a match in the gene expression data
matches = len(set(data_prism.molecular_data.index.values) & set(sanger_dataset["cell_line"]))
print(f"There are a total of {matches} matches with gene expression data.")

# Only keep cell lines where we have a match
sanger_dataset = sanger_dataset[~sanger_dataset["cell_line"].isin(list(set(data_prism.molecular_data.index.values)))]

There are a total of 0 matches with gene expression data.


In [17]:
sanger_dataset

Unnamed: 0,cell_line,DRUG,auc,IC50,auc_per_drug,Drug_ID,Dataset
0,ACH-002270,ERLOTINIB,0.982116,10.977393,0.447919,1,GDSC1
1,ACH-002104,ERLOTINIB,0.984820,23.133991,0.476415,1,GDSC1
2,ACH-002111,ERLOTINIB,0.944463,11.926884,0.051114,1,GDSC1
9,ACH-001599,ERLOTINIB,0.995002,13.827025,0.583717,1,GDSC1
20,ACH-002212,ERLOTINIB,0.536042,0.156803,-4.253019,1,GDSC1
...,...,...,...,...,...,...,...
327256,ACH-001081,BMS-754807,0.878081,11.153741,1.088692,2171,GDSC2
327279,ACH-002233,BMS-754807,0.729563,2.345731,0.216928,2171,GDSC2
327302,ACH-002163,JQ1,0.924804,15.629044,0.585598,2172,GDSC2
327318,ACH-002179,JQ1,0.824933,4.599175,-0.170587,2172,GDSC2


In [None]:
# Find matches between gene Databank and cell line
data_prism