In [1]:
import numpy as np
import pandas as pd

In [2]:
molecular_data = pd.read_csv("../../data/preprocessed/rna_df.csv", index_col=0)
drugs = pd.read_csv("../../data/preprocessed/node2vec_embedding.csv", index_col=0)
print(molecular_data.shape, drugs.shape)

(1479, 19193) (500, 1149)


In [3]:
# Load secondary response dataset; Screen is the drug screen across different cell lines
secondary_response_df = pd.read_csv(
    "../../data/raw/prism_19q4_repurpose_primaryfiles/secondary-screen-dose-response-curve-parameters.csv"
)
secondary_response_df.head()

  secondary_response_df = pd.read_csv(


Unnamed: 0,broad_id,depmap_id,ccle_name,screen_id,upper_limit,lower_limit,slope,r2,auc,ec50,ic50,name,moa,target,disease.area,indication,smiles,phase,passed_str_profiling,row_name
0,BRD-K71847383-001-12-5,ACH-000879,MFE296_ENDOMETRIUM,HTS002,1,2.122352,-0.022826,-0.026964,1.677789,8415093.0,,cytarabine,ribonucleotide reductase inhibitor,"POLA1, POLB, POLD1, POLE",hematologic malignancy,"acute lymphoblastic leukemia (ALL), chronic ly...",Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)...,Launched,True,ACH-000879
1,BRD-K71847383-001-12-5,ACH-000320,PSN1_PANCREAS,HTS002,1,1.325174,-0.237504,-0.147274,1.2403,9.643742,,cytarabine,ribonucleotide reductase inhibitor,"POLA1, POLB, POLD1, POLE",hematologic malignancy,"acute lymphoblastic leukemia (ALL), chronic ly...",Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)...,Launched,True,ACH-000320
2,BRD-K71847383-001-12-5,ACH-001145,OC316_OVARY,HTS002,1,2.08935,-0.302937,0.193893,1.472333,0.02776687,,cytarabine,ribonucleotide reductase inhibitor,"POLA1, POLB, POLD1, POLE",hematologic malignancy,"acute lymphoblastic leukemia (ALL), chronic ly...",Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)...,Launched,True,ACH-001145
3,BRD-K71847383-001-12-5,ACH-000873,KYSE270_OESOPHAGUS,HTS002,1,1.31182,-0.209393,-0.00546,1.20716,2.654701,,cytarabine,ribonucleotide reductase inhibitor,"POLA1, POLB, POLD1, POLE",hematologic malignancy,"acute lymphoblastic leukemia (ALL), chronic ly...",Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)...,Launched,True,ACH-000873
4,BRD-K71847383-001-12-5,ACH-000855,KYSE150_OESOPHAGUS,HTS002,1,1.369799,-0.27753,0.132818,1.229332,0.5889041,,cytarabine,ribonucleotide reductase inhibitor,"POLA1, POLB, POLD1, POLE",hematologic malignancy,"acute lymphoblastic leukemia (ALL), chronic ly...",Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)...,Launched,True,ACH-000855


In [4]:
# Get rid of comma after every smiles code since it is not needed and will cause an error
# Somehow every smile is twice in there smile_code, smile_code instead of just smile_code
secondary_response_df["smiles"] = secondary_response_df["smiles"].apply(lambda x: x.split(",")[0])
auc_labels = secondary_response_df[["depmap_id", "name", "auc", "smiles"]]

auc_labels = auc_labels.copy()
auc_labels.rename(columns={"depmap_id": "cell_line", "name": "DRUG"}, inplace=True)
auc_labels["DRUG"] = auc_labels["DRUG"].str.upper()

# IC50_labels = IC50_labels.merge(labels_filtered, how = 'inner')
# auc_labels = auc_labels.merge(labels_filtered, how = 'inner')

# Data Cleaning Steps
auc_labels = auc_labels[["cell_line", "DRUG", "auc", "smiles"]].drop_duplicates()
auc_labels = auc_labels[~np.isnan(auc_labels["auc"])].reset_index(drop=True)
auc_labels = auc_labels[~np.isinf(auc_labels["auc"])].reset_index(drop=True)

# Only keep cell lines in secondary screening where we have data for
auc_labels = auc_labels[np.isin(auc_labels["cell_line"], molecular_data.index)]
auc_labels = auc_labels[np.isin(auc_labels["DRUG"], drugs.columns)]

# Calculate additional statistics
auc_labels["logauc"] = np.log(auc_labels["auc"])
auc_labels["auc_per_drug"] = auc_labels.groupby("DRUG")["logauc"].transform(lambda x: (x - x.mean()) / x.std())
auc_labels["means"] = auc_labels.groupby("DRUG")["logauc"].transform(lambda x: x.mean())
auc_labels["stds"] = auc_labels.groupby("DRUG")["logauc"].transform(lambda x: x.std())
auc_labels["range"] = auc_labels.groupby("DRUG")["logauc"].transform(lambda x: x.max() - x.min())

auc_labels.to_csv("../../data/preprocessed/auc_secondary_screen_prediction_targets.csv")
auc_labels

Unnamed: 0,cell_line,DRUG,auc,smiles,logauc,auc_per_drug,means,stds,range
0,ACH-000879,CYTARABINE,1.677789,Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)n1,0.517477,2.367402,0.228684,0.121987,0.782691
1,ACH-000320,CYTARABINE,1.240300,Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)n1,0.215353,-0.109281,0.228684,0.121987,0.782691
2,ACH-001145,CYTARABINE,1.472333,Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)n1,0.386848,1.296560,0.228684,0.121987,0.782691
3,ACH-000873,CYTARABINE,1.207160,Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)n1,0.188270,-0.331294,0.228684,0.121987,0.782691
4,ACH-000855,CYTARABINE,1.229332,Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)n1,0.206471,-0.182097,0.228684,0.121987,0.782691
...,...,...,...,...,...,...,...,...,...
700916,ACH-001321,BREQUINAR,0.987409,Cc1c(nc2ccc(F)cc2c1C(O)=O)-c1ccc(cc1)-c1ccccc1F,-0.012671,0.979322,-0.184222,0.175173,1.078361
700917,ACH-001321,AZD8931,1.000000,CNC(=O)CN1CCC(CC1)Oc1cc2c(Nc3cccc(Cl)c3F)ncnc2...,0.000000,0.879937,-0.199376,0.226580,2.297688
700918,ACH-001321,AZD2014,1.000000,CNC(=O)c1cccc(c1)-c1ccc2c(nc(nc2n1)N1CCOC[C@@H...,0.000000,1.643680,-0.279217,0.169873,1.025072
700919,ACH-001321,MOTESANIB,0.888088,CC1(C)CNc2cc(NC(=O)c3cccnc3NCc3ccncc3)ccc12,-0.118685,-0.367226,-0.084827,0.092199,0.957449


In [5]:
# Duplicate check

check_duplicates = auc_labels[["cell_line", "DRUG"]]
# check_duplicates.drop_duplicates()

mask = auc_labels[["cell_line", "DRUG"]]

filtered_df = mask[mask["cell_line"] == "ACH-000879"]
print("Rows where 'cell_line' feature contains 'ACH-000879':")
print(filtered_df[filtered_df.duplicated(keep=False)])  # keep=False marks all duplicates as True in the mask

Rows where 'cell_line' feature contains 'ACH-000879':
         cell_line          DRUG
1759    ACH-000879     ADAPALENE
5926    ACH-000879    BELINOSTAT
18608   ACH-000879    BORTEZOMIB
19522   ACH-000879  ESTRAMUSTINE
20893   ACH-000879    DISULFIRAM
...            ...           ...
689236  ACH-000879      IMATINIB
689237  ACH-000879   TALAZOPARIB
689238  ACH-000879       AZD2014
689239  ACH-000879     MOTESANIB
689240  ACH-000879     LINIFANIB

[219 rows x 2 columns]


In [6]:
check_duplicates.drop_duplicates(inplace=True)
print("If we drop duplicates of cell line and Drug (auc_per_drug is different -> different experiments)",
f"we have {check_duplicates.shape[0]} rows.")
check_duplicates

If we drop duplicates of cell line and Drug (auc_per_drug is different -> different experiments) we have 493577 rows.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  check_duplicates.drop_duplicates(inplace=True)


Unnamed: 0,cell_line,DRUG
0,ACH-000879,CYTARABINE
1,ACH-000320,CYTARABINE
2,ACH-001145,CYTARABINE
3,ACH-000873,CYTARABINE
4,ACH-000855,CYTARABINE
...,...,...
700886,ACH-001321,CINACALCET
700896,ACH-001321,INC-280
700904,ACH-001321,DOVITINIB
700906,ACH-001321,LINSITINIB
