In [110]:
import time
import pickle
import torch
import copy
import torch.nn          as nn
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt 
import seaborn           as sns

from rdkit import Chem
from typing           import List
from torch.utils.data import Dataset, DataLoader
from torch_geometric.loader import DataLoader as PyG_Dataloader

from config import (
    PATH_TO_FEATURES,
    PATH_TO_SAVED_DRUG_FEATURES,
    PATH_SUMMARY_DATASETS
)

torch.manual_seed(42)
sns.set_theme(style="white")

- [x] Create & save GDSC2 drug response matrix 
  - `<folder>/drug_response_matrix__gdsc2.pkl`
- [x] Create & save cell-line gene dataset as table/dictionary
  - `<folder>/cell_line_gene_matrix.pkl`
- [x] Create & save cell-line gene dataset as graph 
  - `<folder>/cell_line_graphs_dict.pkl`
- [x] Create & save drug dataset as table/dictionary 
  - as table: `<folder>/drug_smiles_fingerprints_matrix.pkl`
  - as dictionary: `<folder>/drug_smiles_fingerprints_dict.pkl`
- [x] Create & save drug dataset as graph 
  - `<folder>/drug_graphs_dict.pkl`

where `<folder> = datasets_for_model_building/summary_datasets`

---

## Pre-processing

In [3]:
# Reading the cell-line gene graphs.
with open(f'{PATH_TO_FEATURES}cl_graphs_as_dict.pkl', 'rb') as f:
    cl_graphs = pickle.load(f)

# Reading the drug response matrix.
with open(f'{PATH_TO_FEATURES}drugs_sparse_gdsc2.pkl', 'rb') as f: 
    drug_cl = pickle.load(f)  

In [4]:
print(f"Number of cell-lines/graphs: {len(list(cl_graphs.keys()))}")
print(cl_graphs['22RV1'])

Number of cell-lines/graphs: 983
Data(x=[858, 4], edge_index=[2, 83126])


In [5]:
print(f"Shape: {drug_cl.shape}")
print(f"Number of different drug id's   : {len(np.unique(drug_cl.DRUG_ID.values))}")
print(f"Number of different drug name's : {len(np.unique(drug_cl.DRUG_NAME.values))}")
drug_cl.head(10)

Shape: (135242, 5)
Number of different drug id's   : 198
Number of different drug name's : 192


Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,DATASET,LN_IC50
3441054,22RV1,1003,Camptothecin,GDSC2,-3.142631
3459252,22RV1,1004,Vinblastine,GDSC2,-4.459259
3489119,22RV1,1005,Cisplatin,GDSC2,3.622285
3508920,22RV1,1006,Cytarabine,GDSC2,3.826935
3533420,22RV1,1007,Docetaxel,GDSC2,-3.835431
3551362,22RV1,1010,Gefitinib,GDSC2,4.032555
3571270,22RV1,1011,Navitoclax,GDSC2,3.963435
3589233,22RV1,1012,Vorinostat,GDSC2,0.846758
3606526,22RV1,1013,Nilotinib,GDSC2,3.93594
3627474,22RV1,1017,Olaparib,GDSC2,5.238895


In [6]:
drug_cl[['DRUG_ID', 'DRUG_NAME']].groupby(['DRUG_ID']).nunique().sort_values(['DRUG_NAME'], ascending=False).head(10)

Unnamed: 0_level_0,DRUG_NAME
DRUG_ID,Unnamed: 1_level_1
1003,1
1739,1
1799,1
1802,1
1804,1
1806,1
1807,1
1808,1
1809,1
1810,1


In [7]:
drug_cl[['DRUG_ID', 'DRUG_NAME']].groupby(['DRUG_NAME']).nunique().sort_values(['DRUG_ID'], ascending=False).head(10)
# TODO: only yake one DRUG_NAME, such that number of DRUG_NAME's equals number of DRUG_ID's and that there is a 1-to-1 mapping

Unnamed: 0_level_0,DRUG_ID
DRUG_NAME,Unnamed: 1_level_1
Dactinomycin,2
Fulvestrant,2
Oxaliplatin,2
Docetaxel,2
Ulixertinib,2
Uprosertib,2
PD173074,1
Olaparib,1
Osimertinib,1
P22077,1


- NOTE: We found that 6 `DRUG_NAME`'s occur for two `DRUG_ID`'s. 

The cell-line gene dataset is basically ready to go. It "only" needs to be transformed to a pytorch `Data` class. The graph per cell-line will be used as input to the GNN cell-line branch of the bi-modal model. However, the drug datasets is the drug response matrix. It doesn't contain the drug features. In the following subsection we will obtain the [SMILES fingerprints](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00445-4) for each unique `DRUG_ID`. This will later be used as the input to the drug branch of the bi-modal model.

### Transform drugs to SMILES fingerprints

In [8]:
with open(f'{PATH_TO_SAVED_DRUG_FEATURES}drug_name_fingerprints_dataframe.pkl', 'rb') as f:
    drug_name_smiles = pickle.load(f)
# drug_name_smiles.set_index(['drug_name'], inplace=True)
print(drug_name_smiles.shape)
# TODO: Note that yet there are some DRUG_NAME's which have >1 DRUG_ID
drug_name_smiles.head(5)

(367, 257)


Unnamed: 0,drug_name,0,1,2,3,4,5,6,7,8,...,246,247,248,249,250,251,252,253,254,255
0,(5Z)-7-Oxozeaenol,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
1,5-Fluorouracil,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,A-443654,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,A-770041,1,1,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
4,A-83-01,0,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [9]:
all_non_uniqs = drug_cl[['DRUG_ID', 'DRUG_NAME']].groupby(['DRUG_NAME']).nunique().sort_values(['DRUG_ID'], ascending=False)
all_non_uniqs = all_non_uniqs[all_non_uniqs.DRUG_ID>1].reset_index()

smiles = []
for smile in list(drug_name_smiles.drug_name.values):
    if smile in list(all_non_uniqs.DRUG_NAME.values): 
        smiles.append(smile)
        print(f"{smile} is there and has not 1-to-1 mapping")

Docetaxel is there and has not 1-to-1 mapping
Ulixertinib is there and has not 1-to-1 mapping
Uprosertib is there and has not 1-to-1 mapping


In [10]:
drug_cl[drug_cl.DRUG_NAME=='Docetaxel']

Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,DATASET,LN_IC50
3533420,22RV1,1007,Docetaxel,GDSC2,-3.835431
5096727,22RV1,1819,Docetaxel,GDSC2,-2.048442
3532924,23132-87,1007,Docetaxel,GDSC2,-5.663205
5096517,23132-87,1819,Docetaxel,GDSC2,-4.758792
3518116,42-MG-BA,1007,Docetaxel,GDSC2,-5.157606
...,...,...,...,...,...
3534191,YT,1007,Docetaxel,GDSC2,-4.989452
3515605,ZR-75-30,1007,Docetaxel,GDSC2,-1.275373
5096160,ZR-75-30,1819,Docetaxel,GDSC2,3.934073
3537528,huH-1,1007,Docetaxel,GDSC2,-2.747687


We will remove these for now. However, this needs to be tackled in the future.

In [11]:
drug_cl_v2 = drug_cl[~drug_cl.DRUG_NAME.isin(smiles)]
print(drug_cl_v2.shape)

(130826, 5)


In [12]:
drug_name_smiles_v2 = drug_name_smiles[~drug_name_smiles.drug_name.isin(smiles)]
print(drug_name_smiles_v2.shape)

(364, 257)


These are now the drug response matrix and the SMILES matrix without the non-1-to-1 mapped `DRUG_NAME` and `DRUG_ID`'s.

In [13]:
drug_name_smiles_v3 = pd.merge(left=drug_name_smiles_v2,
                                right=drug_cl_v2[['DRUG_ID', 'DRUG_NAME']],
                                how='left',
                                left_on=['drug_name'],
                                right_on=['DRUG_NAME'])
drug_name_smiles_v3.drop_duplicates(inplace=True)
drug_name_smiles_v3.drop(['DRUG_NAME'], axis=1, inplace=True)
drug_name_smiles_v3 = drug_name_smiles_v3[~drug_name_smiles_v3.DRUG_ID.isna()]
drug_name_smiles_v3['DRUG_ID'] = drug_name_smiles_v3.DRUG_ID.astype(np.int64)
drug_name_smiles_v3.rename(columns={'drug_name': 'DRUG_NAME'}, inplace=True)
drug_name_smiles_v3.insert(1, 'DRUG_ID', drug_name_smiles_v3.pop('DRUG_ID'))
print(drug_name_smiles_v3.shape)
drug_name_smiles_v3.head(5)

(152, 258)


Unnamed: 0,DRUG_NAME,DRUG_ID,0,1,2,3,4,5,6,7,...,246,247,248,249,250,251,252,253,254,255
1,5-Fluorouracil,1073,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
810,ABT737,1910,1,1,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
1562,AGI-5198,1913,0,1,1,0,1,0,0,0,...,0,0,0,0,1,1,0,0,0,0
2314,AGI-6780,1634,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
3044,AMG-319,2045,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
fingerprints = drug_name_smiles_v3.loc[:, ~drug_name_smiles_v3.columns.isin(['DRUG_NAME'])]
print(fingerprints.shape)
fingerprints.head(5)

(152, 257)


Unnamed: 0,DRUG_ID,0,1,2,3,4,5,6,7,8,...,246,247,248,249,250,251,252,253,254,255
1,1073,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
810,1910,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
1562,1913,0,1,1,0,1,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
2314,1634,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
3044,2045,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
# Select only the drugs in the drug response matrix which also have a SMILES string.
drug_cl_v3 = drug_cl_v2[drug_cl_v2.DRUG_ID.isin(list(fingerprints.DRUG_ID.values))]
print(drug_cl_v3.shape)
print(f"Number of unique cell-lines : {len(list(np.unique(drug_cl_v3.CELL_LINE_NAME.values)))}")
drug_cl_v3.head(5)

(101370, 5)
Number of unique cell-lines : 809


Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,DATASET,LN_IC50
3441054,22RV1,1003,Camptothecin,GDSC2,-3.142631
3459252,22RV1,1004,Vinblastine,GDSC2,-4.459259
3508920,22RV1,1006,Cytarabine,GDSC2,3.826935
3551362,22RV1,1010,Gefitinib,GDSC2,4.032555
3571270,22RV1,1011,Navitoclax,GDSC2,3.963435


In [16]:
# Select only the cell-line graphs for which the cell-line is also in the drug-response matrix.
cl_graphs_v2 = copy.deepcopy(cl_graphs)
not_in = []
cls = list(np.unique(drug_cl_v3.CELL_LINE_NAME.values))
for cl in list(cl_graphs.keys()):
    if cl not in cls: 
        not_in.append(cl)
        cl_graphs_v2.pop(cl, None)

print(f"Number of cell-lines before    : {len(list(cl_graphs.keys()))}")
print(f"Number of not-found cell-lines : {len(not_in)}")
print(f"Number of cell-lines after     : {len(list(cl_graphs_v2.keys()))}")

Number of cell-lines before    : 983
Number of not-found cell-lines : 177
Number of cell-lines after     : 806


Since the cell-line graph dataset has less unique cell-lines then the drug-response matrix we need to remove the cell-line from the drug-response matrix which dont have a graph.

In [17]:
cls_with_no_graph = set(np.unique(drug_cl_v3.CELL_LINE_NAME.values)).difference(set(cl_graphs_v2.keys()))
drug_cl_v4 = drug_cl_v3[~drug_cl_v3.CELL_LINE_NAME.isin(cls_with_no_graph)]
print(drug_cl_v4.shape)
print(f"Number of unique cell-lines : {len(list(np.unique(drug_cl_v4.CELL_LINE_NAME.values)))}")
drug_cl_v4.head(5)


(100972, 5)
Number of unique cell-lines : 806


Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,DATASET,LN_IC50
3441054,22RV1,1003,Camptothecin,GDSC2,-3.142631
3459252,22RV1,1004,Vinblastine,GDSC2,-4.459259
3508920,22RV1,1006,Cytarabine,GDSC2,3.826935
3551362,22RV1,1010,Gefitinib,GDSC2,4.032555
3571270,22RV1,1011,Navitoclax,GDSC2,3.963435


This is now the final drug dataset for the drug branch of the bi-modal network.

---

## Dataset summary

We got three datasets now: 
- Drug response matrix
- fingerprint matrix
- cell-line graph dictionary

## Save drug response matrix

In [18]:
# Drug response matrix holding the ln(IC50) values for each cell-line drug tuple.
drug_response_matrix = copy.deepcopy(drug_cl_v4)
print(drug_response_matrix.shape)
drug_response_matrix.sort_values(['DRUG_ID']).head(5)

(100972, 5)


Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,DATASET,LN_IC50
3441054,22RV1,1003,Camptothecin,GDSC2,-3.142631
3441397,NCI-H1975,1003,Camptothecin,GDSC2,-1.579555
3424961,CL-34,1003,Camptothecin,GDSC2,-2.68195
3443434,LNZTA3WT4,1003,Camptothecin,GDSC2,-1.564607
3440494,A101D,1003,Camptothecin,GDSC2,-2.864103


In [23]:
SAVE = True 
if SAVE: 
    drug_response_matrix.to_pickle(f'{PATH_SUMMARY_DATASETS}drug_response_matrix__gdsc2.pkl')

## Save drug dataset as table & dictionary (SMILES fingerprints)

In [19]:
# The drug matrix holding the fingerprints for the drugs in the drug response matrix.
print(fingerprints.shape)
fingerprints.sort_values(['DRUG_ID']).head(5)

(152, 257)


Unnamed: 0,DRUG_ID,0,1,2,3,4,5,6,7,8,...,246,247,248,249,250,251,252,253,254,255
21273,1003,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
94088,1004,1,0,0,0,0,0,0,0,1,...,0,1,0,1,0,1,0,0,0,0
25052,1006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
39836,1010,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
61065,1011,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,1


In [20]:
# Drugs as dictionary.
fingerprints_dict = fingerprints.set_index('DRUG_ID').T.to_dict('list')
print(len(fingerprints_dict.keys()))
print(len(fingerprints_dict[1003]))

152
256


In [228]:
SAVE = True 
if SAVE: 
    fingerprints.to_pickle(f'{PATH_SUMMARY_DATASETS}drug_smiles_fingerprints_matrix.pkl')
    with open(f'{PATH_SUMMARY_DATASETS}drug_smiles_fingerprints_dict.pkl', 'wb') as handle:
        pickle.dump(fingerprints_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


## Save cell-lines as gene-interaction graphs

In [21]:
# The cell-line graphs holding for each cell-line in the drug-response-matrix the corresponding 
# graph with the cell-line level gene features.
cell_line_graphs = copy.deepcopy(cl_graphs_v2)
print(f"Number of cell-lines/graphs: {len(list(cell_line_graphs.keys()))}")
print(cell_line_graphs['22RV1'])

Number of cell-lines/graphs: 806
Data(x=[858, 4], edge_index=[2, 83126])


In [27]:
SAVE = True 
if SAVE: 
    with open(f'{PATH_SUMMARY_DATASETS}cell_line_graphs_dict.pkl', 'wb') as handle:
        pickle.dump(cell_line_graphs, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Create drug graph dataset

In [33]:
READ = True
if READ: 
    with open(f'{PATH_SUMMARY_DATASETS}drug_smiles_fingerprints.pkl', 'rb') as f: fingerprints = pickle.load(f)
    with open(f'{PATH_SUMMARY_DATASETS}drug_response_matrix__gdsc2.pkl', 'rb') as f: drug_response_matrix = pickle.load(f)
print(fingerprints.shape)
fingerprints.sort_values(['DRUG_ID']).head(5)

(152, 257)


Unnamed: 0,DRUG_ID,0,1,2,3,4,5,6,7,8,...,246,247,248,249,250,251,252,253,254,255
21273,1003,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
94088,1004,1,0,0,0,0,0,0,0,1,...,0,1,0,1,0,1,0,0,0,0
25052,1006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
39836,1010,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
61065,1011,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,1


In [38]:
print(drug_response_matrix.shape)
print(len(np.unique(drug_response_matrix.DRUG_ID)))
print(len(np.unique(drug_response_matrix.DRUG_NAME)))
drug_response_matrix.head(5)

(100972, 5)
152
152


Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,DATASET,LN_IC50
3441054,22RV1,1003,Camptothecin,GDSC2,-3.142631
3459252,22RV1,1004,Vinblastine,GDSC2,-4.459259
3508920,22RV1,1006,Cytarabine,GDSC2,3.826935
3551362,22RV1,1010,Gefitinib,GDSC2,4.032555
3571270,22RV1,1011,Navitoclax,GDSC2,3.963435


In [32]:
# Get the SMILES strings.
PATH_TO_GDSC_DATA = '../../datasets/gdsc/'
SMILES_FILE = 'GDSC_compounds_inchi_key_with_smiles.csv'

start = time.time()
smiles_data = pd.read_csv(f'{PATH_TO_GDSC_DATA}{SMILES_FILE}', sep=",", header=0)
print(f"File `{SMILES_FILE}` took {time.time()-start:.5f} seconds to import. \nShape: {smiles_data.shape}")
smiles_data.head(3)

File `GDSC_compounds_inchi_key_with_smiles.csv` took 0.01723 seconds to import. 
Shape: (425, 4)


Unnamed: 0.1,Unnamed: 0,drug_name,inchi_key,smiles
0,0,(5Z)-7-Oxozeaenol,NEQZWEXWOFPKOT-BYRRXHGESA-N,C[C@H]1CC=CC(=O)[C@H]([C@H](CC=Cc2cc(cc(c2C(=O...
1,1,5-Fluorouracil,GHASVSINZRGABV-UHFFFAOYSA-N,c1c(c(nc(n1)O)O)F
2,2,A-443654,YWTBGJGMTBHQTM-IBGZPJMESA-N,Cc1c2cc(ccc2n[nH]1)c1cc(cnc1)OC[C@H](Cc1c[nH]c...


In [97]:
# Get the SMILES strings for the drugs from the drug response matrix.
uniq_drug_names = list(np.unique(drug_response_matrix.DRUG_NAME))
smiles_subset = smiles_data[smiles_data.drug_name.isin(uniq_drug_names)][['drug_name', 'smiles']]
smiles_subset = smiles_subset[smiles_subset.smiles!='not_found']
smiles_subset.drop_duplicates(inplace=True)
print(smiles_subset.shape)
smiles_subset.head(10)

(152, 2)


Unnamed: 0,drug_name,smiles
1,5-Fluorouracil,c1c(c(nc(n1)O)O)F
6,AGI-6780,c1cc(cc(c1)N=C(Nc1cc(ccc1c1ccsc1)S(=O)(=O)NC1C...
17,AZD6482,Cc1cc([C@@H](C)Nc2ccccc2C(=O)O)c2nc(cc(=O)n2c1...
18,AZD7762,c1cc(cc(c1)F)c1cc(c(C(=O)N[C@H]2CCCNC2)s1)NC(=N)O
19,AZD8055,C[C@H]1COCCN1c1c2ccc(c3ccc(c(c3)CO)OC)nc2nc(n1...
20,Afatinib,CN(C)C/C=C/C(=Nc1cc2c(cc1O[C@H]1CCOC1)ncnc2Nc1...
22,Alisertib,COc1cccc(c1C1=NCc2cnc(Nc3ccc(c(c3)OC)C(=O)O)nc...
27,Axitinib,CN=C(c1ccccc1Sc1ccc2c(/C=C/c3ccccn3)[nH]nc2c1)O
30,BI-2536,CC[C@@H]1C(=O)N(C)c2cnc(Nc3ccc(cc3OC)C(=NC3CCN...
33,BMS-345541,Cc1ccc2c(c1)n1c(C)cnc1c(NCCN)n2


In [108]:
# Join the fingerprints to get also the DRUG_ID's.
smiles_v2 = pd.merge(left=smiles_subset, 
        right=drug_response_matrix[['DRUG_NAME', 'DRUG_ID']], 
        how='left', left_on=['drug_name'], right_on=['DRUG_NAME']).drop_duplicates()
smiles_v2.reset_index(inplace=True)        
smiles_v2 = smiles_v2[['DRUG_NAME', 'DRUG_ID', 'smiles']]
smiles_v2.rename(columns={'smiles': 'SMILES'}, inplace=True)
print(smiles_v2.shape)        
smiles_v2.head(10)

(152, 3)


Unnamed: 0,DRUG_NAME,DRUG_ID,SMILES
0,5-Fluorouracil,1073,c1c(c(nc(n1)O)O)F
1,AGI-6780,1634,c1cc(cc(c1)N=C(Nc1cc(ccc1c1ccsc1)S(=O)(=O)NC1C...
2,AZD6482,2169,Cc1cc([C@@H](C)Nc2ccccc2C(=O)O)c2nc(cc(=O)n2c1...
3,AZD7762,1022,c1cc(cc(c1)F)c1cc(c(C(=O)N[C@H]2CCCNC2)s1)NC(=N)O
4,AZD8055,1059,C[C@H]1COCCN1c1c2ccc(c3ccc(c(c3)CO)OC)nc2nc(n1...
5,Afatinib,1032,CN(C)C/C=C/C(=Nc1cc2c(cc1O[C@H]1CCOC1)ncnc2Nc1...
6,Alisertib,1051,COc1cccc(c1C1=NCc2cnc(Nc3ccc(c(c3)OC)C(=O)O)nc...
7,Axitinib,1021,CN=C(c1ccccc1Sc1ccc2c(/C=C/c3ccccn3)[nH]nc2c1)O
8,BI-2536,1086,CC[C@@H]1C(=O)N(C)c2cnc(Nc3ccc(cc3OC)C(=NC3CCN...
9,BMS-345541,1249,Cc1ccc2c(c1)n1c(C)cnc1c(NCCN)n2


### Transform SMILES to molecular graph

In [130]:
print(f'Plotting SMILES=`{smiles_v2.iloc[0].SMILES}`')
m = Chem.MolFromSmiles(smiles_v2.iloc[0].SMILES)
print(type(m))
m

Plotting SMILES=`c1c(c(nc(n1)O)O)F`
<class 'rdkit.Chem.rdchem.Mol'>


<rdkit.Chem.rdchem.Mol at 0x13ffaa680>

- method `torch_geometric.utils.from_utils` is taken from https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/utils/smiles.html

In [135]:
for i in range(smiles_v2.shape[0]):
    print(f'DRUG_NAME: {smiles_v2.iloc[i].DRUG_NAME}')
    print(f'DRUG_ID: {smiles_v2.iloc[i].DRUG_ID}')
    print(f'SMILES: {smiles_v2.iloc[i].SMILES}')
    a,b,c = smiles_v2.iloc[i]
    print(a)
    break 

DRUG_NAME: 5-Fluorouracil
DRUG_ID: 1073
SMILES: c1c(c(nc(n1)O)O)F
5-Fluorouracil


In [142]:
# ref: https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/utils/smiles.html 
from torch_geometric.utils import from_smiles 

# Create dictionary with DRUG_ID as key and smiles molecular graph as value.
smiles_graphs = {}
for i in range(smiles_v2.shape[0]):
    drug_name, drug_id, smiles = smiles_v2.iloc[i]
    smiles_graphs[drug_id] = from_smiles(smiles)

print(f'Number of keys/drugs : {len(smiles_graphs.keys())}')
# Print some examples.
for i in range(15):
    print(f'drug_id: {smiles_v2.iloc[i].DRUG_ID:5.0f} | drug_name: {smiles_v2.iloc[i].DRUG_NAME:15s} | graph: {smiles_graphs[smiles_v2.iloc[i].DRUG_ID]}')

Number of keys/drugs : 152
drug_id:  1073 | drug_name: 5-Fluorouracil  | graph: Data(x=[9, 9], edge_index=[2, 18], edge_attr=[18, 3], smiles='c1c(c(nc(n1)O)O)F')
drug_id:  1634 | drug_name: AGI-6780        | graph: Data(x=[32, 9], edge_index=[2, 70], edge_attr=[70, 3], smiles='c1cc(cc(c1)N=C(Nc1cc(ccc1c1ccsc1)S(=O)(=O)NC1CC1)O)C(F)(F)F')
drug_id:  2169 | drug_name: AZD6482         | graph: Data(x=[30, 9], edge_index=[2, 66], edge_attr=[66, 3], smiles='Cc1cc([C@@H](C)Nc2ccccc2C(=O)O)c2nc(cc(=O)n2c1)N1CCOCC1')
drug_id:  1022 | drug_name: AZD7762         | graph: Data(x=[25, 9], edge_index=[2, 54], edge_attr=[54, 3], smiles='c1cc(cc(c1)F)c1cc(c(C(=O)N[C@H]2CCCNC2)s1)NC(=N)O')
drug_id:  1059 | drug_name: AZD8055         | graph: Data(x=[34, 9], edge_index=[2, 76], edge_attr=[76, 3], smiles='C[C@H]1COCCN1c1c2ccc(c3ccc(c(c3)CO)OC)nc2nc(n1)N1CCOC[C@@H]1C')
drug_id:  1032 | drug_name: Afatinib        | graph: Data(x=[34, 9], edge_index=[2, 74], edge_attr=[74, 3], smiles='CN(C)C/C=C/C(=Nc1cc2c(

### Save drug graph dataset

In [143]:
SAVE = True 
if SAVE: 
    with open(f'{PATH_SUMMARY_DATASETS}drug_graphs_dict.pkl', 'wb') as handle:
        pickle.dump(smiles_graphs, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Create cell-line tabular dataset

- [ ] 4*858 columns, each time 858 genes with either one of the features. Thus we will have 3432 columns in the end for each cell-line.

### Read datasets

In [150]:
# The following feature tables have been created in notebook:
# `04_v2_mutations.ipynb`
READ = True
if READ:
    with open(f'{PATH_TO_FEATURES}gexpr_sparse.pkl', 'rb') as f: 
        gexpr_sparse = pickle.load(f)
        print(f"gexpr_sparse.shape : {gexpr_sparse.shape}")
    with open(f'{PATH_TO_FEATURES}cnvg_sparse.pkl', 'rb') as f: 
        cnvg_sparse = pickle.load(f)
        print(f"cnvg_sparse.shape : {cnvg_sparse.shape}")
    with open(f'{PATH_TO_FEATURES}cnvp_sparse.pkl', 'rb') as f: 
        cnvp_sparse = pickle.load(f)
        print(f"cnvp_sparse.shape : {cnvp_sparse.shape}")
    with open(f'{PATH_TO_FEATURES}mut_sparse.pkl', 'rb') as f: 
        mut_sparse = pickle.load(f)
        print(f"mut_sparse.shape : {mut_sparse.shape}")

gexpr_sparse.shape : (983, 859)
cnvg_sparse.shape : (983, 859)
cnvp_sparse.shape : (983, 859)
mut_sparse.shape : (983, 859)


In [154]:
gexpr_sparse.head(3)

Unnamed: 0,CELL_LINE_NAME,FBXL12,PIN1,PAK4,GNA15,ARPP19,EAPP,MOK,MTHFD2,TIPARP,...,PDHX,DFFB,FOSL1,ETS1,EBNA1BP2,MYL9,MLLT11,PFKL,FGFR4,SDHB
0,22RV1,7.023759,6.067534,4.31875,3.261427,6.297582,8.313991,5.514912,10.594112,5.222366,...,7.821536,3.601622,3.225596,3.651201,7.895763,3.953414,4.059382,4.376822,3.215209,9.267565
1,23132-87,6.714387,5.695096,4.536146,3.295886,7.021037,8.50008,4.862145,10.609245,6.528668,...,8.094289,3.596762,3.486299,3.127452,7.852436,3.869411,4.248318,4.989945,4.328643,9.51587
2,42-MG-BA,7.752402,5.475753,4.033714,3.176525,7.279671,8.013367,4.957332,11.266705,7.445954,...,7.984052,3.317746,5.106906,5.305024,6.508066,7.840349,8.632889,4.792137,3.078971,8.495921


In [155]:
cnvg_sparse.head(3)

Unnamed: 0,CELL_LINE_NAME,FBXL12,PIN1,PAK4,GNA15,ARPP19,EAPP,MOK,MTHFD2,TIPARP,...,PDHX,DFFB,FOSL1,ETS1,EBNA1BP2,MYL9,MLLT11,PFKL,FGFR4,SDHB
0,22RV1,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,...,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,-1.0,-1.0,-1.0
1,23132-87,-1.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,-1.0,0.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0
2,42-MG-BA,0.0,0.0,0.0,0.0,1.0,1.0,1.0,-1.0,-1.0,...,-1.0,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,1.0,0.0


In [156]:
cnvp_sparse.head(3)

Unnamed: 0,CELL_LINE_NAME,FBXL12,PIN1,PAK4,GNA15,ARPP19,EAPP,MOK,MTHFD2,TIPARP,...,PDHX,DFFB,FOSL1,ETS1,EBNA1BP2,MYL9,MLLT11,PFKL,FGFR4,SDHB
0,22RV1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0
1,23132-87,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,3.0,3.0,2.0,2.0,2.0
2,42-MG-BA,4.0,4.0,4.0,4.0,5.0,6.0,5.0,3.0,3.0,...,3.0,5.0,5.0,5.0,5.0,6.0,3.0,5.0,5.0,4.0


In [204]:
mut_sparse.columns.name = ''
mut_sparse.head(3)

Unnamed: 0,CELL_LINE_NAME,FBXL12,PIN1,PAK4,GNA15,ARPP19,EAPP,MOK,MTHFD2,TIPARP,...,PDHX,DFFB,FOSL1,ETS1,EBNA1BP2,MYL9,MLLT11,PFKL,FGFR4,SDHB
0,22RV1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,23132-87,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42-MG-BA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
feature_cls = list(np.unique(gexpr_sparse.CELL_LINE_NAME.values))
feature_genes = gexpr_sparse.columns[1:]
print(f"""
All the feature datasets have 
    # unique cell lines : {len(feature_cls)}
    # genes             : {len(feature_genes)}
""")


All the feature datasets have 
    # unique cell lines : 983
    # genes             : 858



In [162]:
print(cl_graphs_v2['TE-9'])
print(len(cl_graphs_v2.keys()))

Data(x=[858, 4], edge_index=[2, 83126])
806


- The genes are the same, while I removed some cell-lines in a pre-processing step.

### Pre-processing

In [166]:
cl_graph_cls = list(cl_graphs_v2.keys())
print(len(cl_graph_cls))
cl_graph_cls[:10]

806


['TE-9',
 'NALM-6',
 'SK-CO-1',
 'Hey',
 'IST-SL2',
 'OACM5-1',
 'NB10',
 'TT2609-C02',
 'NCI-H1648',
 'A253']

In [205]:
# Only take these cell-lines for the feature datasets 
GOAL_SHAPE = (806, 858+1)
gexpr_v2 = gexpr_sparse[gexpr_sparse.CELL_LINE_NAME.isin(cl_graph_cls)]
cnvg_v2 = cnvg_sparse[cnvg_sparse.CELL_LINE_NAME.isin(cl_graph_cls)]
cnvp_v2 = cnvp_sparse[cnvp_sparse.CELL_LINE_NAME.isin(cl_graph_cls)]
mut_v2 = mut_sparse[mut_sparse.CELL_LINE_NAME.isin(cl_graph_cls)]
assert gexpr_v2.shape == cnvg_v2.shape == cnvp_v2.shape == mut_v2.shape == GOAL_SHAPE

Now all feature datasets have the same shape as the cell-line graph and the same cell-lines and genes.

### Join all together

In [174]:
gexpr_v2.head(3)

Unnamed: 0,CELL_LINE_NAME,FBXL12,PIN1,PAK4,GNA15,ARPP19,EAPP,MOK,MTHFD2,TIPARP,...,PDHX,DFFB,FOSL1,ETS1,EBNA1BP2,MYL9,MLLT11,PFKL,FGFR4,SDHB
0,22RV1,7.023759,6.067534,4.31875,3.261427,6.297582,8.313991,5.514912,10.594112,5.222366,...,7.821536,3.601622,3.225596,3.651201,7.895763,3.953414,4.059382,4.376822,3.215209,9.267565
1,23132-87,6.714387,5.695096,4.536146,3.295886,7.021037,8.50008,4.862145,10.609245,6.528668,...,8.094289,3.596762,3.486299,3.127452,7.852436,3.869411,4.248318,4.989945,4.328643,9.51587
2,42-MG-BA,7.752402,5.475753,4.033714,3.176525,7.279671,8.013367,4.957332,11.266705,7.445954,...,7.984052,3.317746,5.106906,5.305024,6.508066,7.840349,8.632889,4.792137,3.078971,8.495921


In [175]:
cnvg_v2.head(3)

Unnamed: 0,CELL_LINE_NAME,FBXL12,PIN1,PAK4,GNA15,ARPP19,EAPP,MOK,MTHFD2,TIPARP,...,PDHX,DFFB,FOSL1,ETS1,EBNA1BP2,MYL9,MLLT11,PFKL,FGFR4,SDHB
0,22RV1,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,...,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,-1.0,-1.0,-1.0
1,23132-87,-1.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,-1.0,0.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0
2,42-MG-BA,0.0,0.0,0.0,0.0,1.0,1.0,1.0,-1.0,-1.0,...,-1.0,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,1.0,0.0


In [210]:
merged = pd.merge(left=gexpr_v2, right=cnvg_v2, 
                  on=['CELL_LINE_NAME'],
                  suffixes=['_gexpr', '_cnvg'])
merged = pd.merge(left=merged, right=cnvp_v2, 
                  on=['CELL_LINE_NAME'],
                  suffixes=['', '_cnvp'])
merged = pd.merge(left=merged, right=mut_v2, 
                  on=['CELL_LINE_NAME'],
                  suffixes=['', '_mut'])
assert merged.shape == (806, 858*4 + 1)                                  
print(merged.shape)
merged.head(5)                                

(806, 3433)


Unnamed: 0,CELL_LINE_NAME,FBXL12_gexpr,PIN1_gexpr,PAK4_gexpr,GNA15_gexpr,ARPP19_gexpr,EAPP_gexpr,MOK_gexpr,MTHFD2_gexpr,TIPARP_gexpr,...,PDHX_mut,DFFB_mut,FOSL1_mut,ETS1_mut,EBNA1BP2_mut,MYL9_mut,MLLT11_mut,PFKL_mut,FGFR4_mut,SDHB_mut
0,22RV1,7.023759,6.067534,4.31875,3.261427,6.297582,8.313991,5.514912,10.594112,5.222366,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,23132-87,6.714387,5.695096,4.536146,3.295886,7.021037,8.50008,4.862145,10.609245,6.528668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42-MG-BA,7.752402,5.475753,4.033714,3.176525,7.279671,8.013367,4.957332,11.266705,7.445954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5637,6.855088,5.980778,4.382524,6.086206,7.423409,8.12018,5.212472,10.329122,7.212325,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,639-V,7.436887,5.963376,4.125069,3.363524,6.665899,8.257927,7.972308,11.359373,5.708368,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [273]:
merged = pd.concat([gexpr_v2.set_index('CELL_LINE_NAME'), 
                    cnvg_v2.set_index('CELL_LINE_NAME'),
                    cnvp_v2.set_index('CELL_LINE_NAME'),
                    mut_v2.set_index('CELL_LINE_NAME')], 
                   keys=('gexpr', 'cnvg', 'cnvp', 'mut'), 
                   axis=1)
merged.columns = merged.columns.map(lambda x: f"{x[1]}_{x[0]}")
assert len([col for col in merged.columns if '_gexpr' in col]) == \
       len([col for col in merged.columns if '_gexpr' in col]) == \
       len([col for col in merged.columns if '_gexpr' in col]) == \
       len([col for col in merged.columns if '_gexpr' in col]) 
print(f"Number of genes:", len([col for col in merged.columns if '_gexpr' in col]))
merged.reset_index(inplace=True)                   
print(f"Number of cell-lines:", len(merged.CELL_LINE_NAME.unique()))
print(merged.shape)
merged.head(3)

Number of genes: 858
Number of cell-lines: 806
(806, 3433)


Unnamed: 0,CELL_LINE_NAME,FBXL12_gexpr,PIN1_gexpr,PAK4_gexpr,GNA15_gexpr,ARPP19_gexpr,EAPP_gexpr,MOK_gexpr,MTHFD2_gexpr,TIPARP_gexpr,...,PDHX_mut,DFFB_mut,FOSL1_mut,ETS1_mut,EBNA1BP2_mut,MYL9_mut,MLLT11_mut,PFKL_mut,FGFR4_mut,SDHB_mut
0,22RV1,7.023759,6.067534,4.31875,3.261427,6.297582,8.313991,5.514912,10.594112,5.222366,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,23132-87,6.714387,5.695096,4.536146,3.295886,7.021037,8.50008,4.862145,10.609245,6.528668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42-MG-BA,7.752402,5.475753,4.033714,3.176525,7.279671,8.013367,4.957332,11.266705,7.445954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save cell-line tabular dataset

In [274]:
SAVE = True 
if SAVE: 
    merged.to_pickle(f'{PATH_SUMMARY_DATASETS}cell_line_gene_matrix.pkl')