In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

2023-07-29 21:15:30.689609: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Read the files that map drug ids to PubChem ids and PubChem ids to SMILES strings

In [3]:
pubchem_to_drugs_df = pd.read_csv('../data/GDSC/1.Drug_listMon Jun 24 09_00_55 2019.csv')

In [4]:
pubchem_to_drugs_df = pubchem_to_drugs_df[["drug_id", "PubCHEM"]]

In [5]:
pubchem_to_drugs_df.dtypes

drug_id     int64
PubCHEM    object
dtype: object

In [6]:
# Replace every PubCHEM entry whose string form is not all digits with NaN.
pubchem_to_drugs_df["PubCHEM"] = pubchem_to_drugs_df["PubCHEM"].where(
    pubchem_to_drugs_df["PubCHEM"].map(lambda raw_id: str(raw_id).isdigit()),
    np.nan,
)

In [7]:
pubchem_to_drugs_df = pubchem_to_drugs_df.dropna()

In [8]:
pubchem_to_drugs_df["PubCHEM"] = pubchem_to_drugs_df["PubCHEM"].astype(np.int64)

In [9]:
pubchem_to_smiles = pd.read_csv('../data/223drugs_pubchem_smiles.txt', sep="\t", header=None)

In [10]:
pubchem_to_smiles.columns = ["PubCHEM", "Smiles"]

In [11]:
pubchem_to_smiles["PubCHEM"] = pubchem_to_smiles["PubCHEM"].astype(np.int64)

In [12]:
pubchem_drugs_smiles_df = pubchem_to_drugs_df.merge(pubchem_to_smiles, on = "PubCHEM")

In [13]:
def get_emb_models(dataset, id_col):
    """Build a frozen Keras model that maps a string id to its feature row.

    The model chains a ``TextVectorization`` lookup (id string -> integer
    index) with a non-trainable ``Embedding`` layer whose weight matrix is
    the numeric content of ``dataset``. No normalisation is applied to the
    feature values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per entity. Column ``id_col`` holds the identifiers; every
        other column is used as a feature and must be numeric.
    id_col : str
        Name of the identifier column in ``dataset``.

    Returns
    -------
    tf.keras.Model
        Model taking a string input of shape ``(batch, 1)`` and returning a
        flattened tensor of shape ``(batch, n_features)``.
    """
    ids = dataset[id_col].values
    n_rows = dataset.shape[0]

    # Two extra slots are allotted on top of the ids: in "int" output mode
    # the Keras TextVectorization layer reserves index 0 for padding and
    # index 1 for out-of-vocabulary tokens, so the ids start at index 2.
    text_vec_layer = tf.keras.layers.TextVectorization(
        max_tokens=n_rows + 2,
        standardize=None,
        split=None,
        output_mode="int",
        vocabulary=ids.tolist(),
    )

    # `columns=` replaces the positional `axis` argument, which pandas
    # deprecated (FutureWarning) and later removed.
    weights = dataset.drop(columns=id_col).values

    # Prepend two all-zero rows so the reserved indices 0 and 1 map to zero
    # vectors and row i of `dataset` sits at embedding index i + 2.
    padding_zeros = np.zeros((2, weights.shape[1]))
    weights = np.vstack((padding_zeros, weights))

    emb_layer = tf.keras.layers.Embedding(
        n_rows + 2,
        weights.shape[1],
        weights=[weights],
        trainable=False,
    )

    input_layer = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
    vec_out = text_vec_layer(input_layer)
    emb_out = emb_layer(vec_out)
    # Collapse the length-1 sequence axis: (batch, 1, n_features) -> (batch, n_features).
    flat_out = tf.keras.layers.Flatten()(emb_out)
    emb_model = tf.keras.models.Model(input_layer, flat_out)
    return emb_model

In [14]:
selected_info_common_cell_lines = "../data/CCLE/cellline_list.txt"
selected_info_common_genes = "../data/CCLE/gene_list.txt"

In [15]:
# Load the gene names from the gene-list file, one per line, with
# surrounding whitespace (including the newline) removed.
with open(selected_info_common_genes) as f:
    common_genes = list(map(str.strip, f))

In [16]:
len(common_genes)

697

In [17]:
# read cancer cell line copy-number data
cancer_cell_copy_num_df = pd.read_csv('../data/CCLE/genomic_copynumber_561celllines_710genes_demap_features.csv')

In [18]:
cancer_cell_copy_num_df.shape

(561, 711)

In [19]:
cancer_cell_copy_num_df.head()

Unnamed: 0.1,Unnamed: 0,AKT3,ABI1,SH2B3,CDH10,CDH11,AKAP9,CDH17,LHFP,CDK4,...,CD79B,BCLAF1,KEAP1,SETDB1,SRGAP3,MAFB,GOLGA5,THRAP3,MED12,CDH1
0,ACH-000828,1.548332,1.040041,1.020747,1.374471,1.0422,0.848321,1.72935,0.847032,1.025471,...,1.121307,0.830675,0.879632,1.492768,1.053714,1.014371,1.213421,0.780331,1.01326,0.64187
1,ACH-000568,0.798791,1.072012,1.038193,1.061508,0.792855,1.061508,1.271905,0.779829,1.002602,...,1.316797,1.049884,0.821346,1.621861,1.291863,1.046269,0.799812,1.074006,1.00551,0.792855
2,ACH-000560,1.080266,0.864616,0.694564,1.053766,0.628903,1.423261,1.3695,0.632267,0.911728,...,1.100098,1.069393,1.232225,0.860476,1.063332,1.194632,1.050246,0.894051,0.873167,0.620049
3,ACH-000561,1.032649,1.408964,0.949651,1.293622,1.274622,1.503638,1.470401,1.198447,0.893277,...,1.063162,0.660578,0.892464,1.252069,0.650263,1.178741,1.052244,0.902153,0.681885,1.274622
4,ACH-000562,1.406262,1.005521,0.997044,1.015986,0.932915,1.007777,1.051055,0.822244,0.997044,...,1.192299,0.974316,0.997944,1.406262,0.810652,1.008153,0.943241,0.878888,0.608959,0.932915


In [20]:
# cancer_cell_copy_num_df[common_genes]

In [21]:
# Fill missing values with the per-column mean. `numeric_only=True` keeps the
# string id column out of the mean computation; without it pandas emits a
# FutureWarning and newer pandas versions raise instead.
cancer_cell_copy_num_df = cancer_cell_copy_num_df.fillna(
    cancer_cell_copy_num_df.mean(numeric_only=True)
)

  cancer_cell_copy_num_df = cancer_cell_copy_num_df.fillna(cancer_cell_copy_num_df.mean())


In [22]:
int1 = cancer_cell_copy_num_df.columns[1:].tolist()

In [23]:
# read cancer cell line gene-expression data
cancer_cell_gen_expr_df = pd.read_csv('../data/CCLE/genomic_expression_561celllines_697genes_demap_features.csv')

In [24]:
# Fill missing values with the per-column mean. `numeric_only=True` keeps the
# string id column out of the mean computation; without it pandas emits a
# FutureWarning and newer pandas versions raise instead.
cancer_cell_gen_expr_df = cancer_cell_gen_expr_df.fillna(
    cancer_cell_gen_expr_df.mean(numeric_only=True)
)

  cancer_cell_gen_expr_df = cancer_cell_gen_expr_df.fillna(cancer_cell_gen_expr_df.mean())


In [25]:
# cancer_cell_gen_expr_df[common_genes]

In [26]:
int2 = cancer_cell_gen_expr_df.columns[1:].tolist()

In [27]:
len(set(int2).intersection(common_genes))

691

In [28]:
common_genes = set(common_genes).intersection(int1)

In [29]:
common_genes = set(common_genes).intersection(int2)

In [30]:
common_genes = np.sort(list(common_genes))

In [31]:
# len(common_genes)

In [32]:
# Keep the first (id) column followed by the common genes, in the order
# given by `common_genes`.
cancer_cell_copy_num_df = cancer_cell_copy_num_df[
    [cancer_cell_copy_num_df.columns[0], *common_genes]
]

In [33]:
# common_genes

In [34]:
# common_genes = list(common_genes)

In [35]:
import pickle

In [36]:
# Persist the final `common_genes` array to disk with pickle.
with open("..//data//common_genes.pickle", "wb") as f:
    pickle.dump(common_genes, f)

In [37]:
# common_genes

In [38]:
# Keep the first (id) column followed by the common genes, in the order
# given by `common_genes`.
cancer_cell_gen_expr_df = cancer_cell_gen_expr_df[
    [cancer_cell_gen_expr_df.columns[0], *common_genes]
]

In [39]:
cancer_cell_gen_expr_df

Unnamed: 0.1,Unnamed: 0,A1CF,ABI1,ABL1,ABL2,ACKR3,ACSL3,ACSL6,ACVR1,ACVR2A,...,ZEB1,ZFHX3,ZMYM3,ZNF331,ZNF384,ZNF429,ZNF479,ZNF521,ZNRF3,ZRSR2
0,ACH-000828,0.000000,4.583760,4.007196,3.171527,1.974529,7.175026,0.367371,2.769772,1.541019,...,0.150560,2.241840,3.666757,2.344828,5.384395,1.794936,0.0,0.000000,2.707083,3.864929
1,ACH-000568,0.028569,4.306700,3.017922,2.144046,2.490570,5.726014,0.250962,1.691534,1.526069,...,0.097611,1.555816,3.747387,4.724105,4.873321,2.560715,0.0,0.028569,1.713696,4.032982
2,ACH-000560,0.042644,4.297191,6.192589,3.291309,0.594549,4.752213,3.401903,3.408712,3.072106,...,2.659925,3.896272,5.205158,3.550901,5.859970,3.485427,0.0,3.674687,4.272770,3.361768
3,ACH-000561,0.042644,6.180705,3.792855,2.568032,6.689858,5.200850,0.250962,3.925050,2.003602,...,0.263034,3.065228,3.954196,2.910733,5.873567,2.495695,0.0,0.941106,2.283922,4.259272
4,ACH-000562,0.042644,5.023255,4.022368,4.302319,0.238787,5.917909,0.137504,3.089159,1.490570,...,0.918386,1.863938,3.400538,2.414136,5.354029,1.867896,0.0,0.124328,1.400538,3.275007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,ACH-000242,0.028569,3.880686,4.171527,1.944858,7.292506,7.716922,0.163499,3.606442,2.952334,...,0.124328,2.641546,3.927896,5.389567,6.100767,3.053111,0.0,0.014355,3.260026,3.089159
557,ACH-000245,0.028569,5.501121,4.651913,1.761285,0.695994,5.201242,0.000000,0.124328,0.650765,...,3.280956,1.104337,4.279471,3.375735,6.071033,2.629939,0.0,0.286881,2.207893,3.643856
558,ACH-000244,0.000000,4.006298,5.726014,4.572283,0.238787,7.356584,0.275007,5.968091,2.726831,...,3.400538,3.343408,3.656496,3.378512,5.268659,3.416840,0.0,3.737687,2.241840,4.321207
559,ACH-000247,2.201634,4.719183,4.396434,1.839960,0.056584,5.860963,0.226509,2.839960,2.114367,...,1.257011,1.560715,3.206331,0.163499,5.208283,0.863938,0.0,0.000000,4.174726,3.583760


In [40]:
cancer_copy_number_model = get_emb_models(cancer_cell_copy_num_df, "Unnamed: 0")

2023-07-29 21:15:32.913473: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-29 21:15:33.417537: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 78910 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:31:00.0, compute capability: 8.0
  weights = dataset.drop(id_col, 1).values


In [41]:
cancer_copy_number_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 1)                0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 1, 691)            389033    
                                                                 
 flatten (Flatten)           (None, 691)               0         
                                                                 
Total params: 389,033
Trainable params: 0
Non-trainable params: 389,033
_________________________________________________________________


In [42]:
cancer_copy_number_model.save("..//Models//cancer_copy_number_model_no_norm_common")





INFO:tensorflow:Assets written to: ..//Models//cancer_copy_number_model_no_norm_common/assets


INFO:tensorflow:Assets written to: ..//Models//cancer_copy_number_model_no_norm_common/assets


In [43]:
cancer_cell_gen_expr_df.shape

(561, 692)

In [44]:
cancer_cell_gen_expr_df.head()

Unnamed: 0.1,Unnamed: 0,A1CF,ABI1,ABL1,ABL2,ACKR3,ACSL3,ACSL6,ACVR1,ACVR2A,...,ZEB1,ZFHX3,ZMYM3,ZNF331,ZNF384,ZNF429,ZNF479,ZNF521,ZNRF3,ZRSR2
0,ACH-000828,0.0,4.58376,4.007196,3.171527,1.974529,7.175026,0.367371,2.769772,1.541019,...,0.15056,2.24184,3.666757,2.344828,5.384395,1.794936,0.0,0.0,2.707083,3.864929
1,ACH-000568,0.028569,4.3067,3.017922,2.144046,2.49057,5.726014,0.250962,1.691534,1.526069,...,0.097611,1.555816,3.747387,4.724105,4.873321,2.560715,0.0,0.028569,1.713696,4.032982
2,ACH-000560,0.042644,4.297191,6.192589,3.291309,0.594549,4.752213,3.401903,3.408712,3.072106,...,2.659925,3.896272,5.205158,3.550901,5.85997,3.485427,0.0,3.674687,4.27277,3.361768
3,ACH-000561,0.042644,6.180705,3.792855,2.568032,6.689858,5.20085,0.250962,3.92505,2.003602,...,0.263034,3.065228,3.954196,2.910733,5.873567,2.495695,0.0,0.941106,2.283922,4.259272
4,ACH-000562,0.042644,5.023255,4.022368,4.302319,0.238787,5.917909,0.137504,3.089159,1.49057,...,0.918386,1.863938,3.400538,2.414136,5.354029,1.867896,0.0,0.124328,1.400538,3.275007


In [45]:
cancer_cell_gen_expr_model = get_emb_models(cancer_cell_gen_expr_df, "Unnamed: 0")

  weights = dataset.drop(id_col, 1).values


In [46]:
cancer_cell_gen_expr_model.save("..//Models//cancer_cell_gen_expr_model_no_norm_common")





INFO:tensorflow:Assets written to: ..//Models//cancer_cell_gen_expr_model_no_norm_common/assets


INFO:tensorflow:Assets written to: ..//Models//cancer_cell_gen_expr_model_no_norm_common/assets


In [47]:
cancer_cell_gen_expr_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 1)                0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 1, 691)            389033    
                                                                 
 flatten_1 (Flatten)         (None, 691)               0         
                                                                 
Total params: 389,033
Trainable params: 0
Non-trainable params: 389,033
_________________________________________________________________


In [50]:
cancer_cell_copy_num_df.columns

Index(['Unnamed: 0', 'A1CF', 'ABI1', 'ABL1', 'ABL2', 'ACKR3', 'ACSL3', 'ACSL6',
       'ACVR1', 'ACVR2A',
       ...
       'ZEB1', 'ZFHX3', 'ZMYM3', 'ZNF331', 'ZNF384', 'ZNF429', 'ZNF479',
       'ZNF521', 'ZNRF3', 'ZRSR2'],
      dtype='object', length=692)

In [51]:
# Fraction of column positions at which the copy-number and gene-expression
# frames agree; 1.0 means both feature matrices share the same column order.
# (Comparing a frame's columns with themselves would always give 1.0.)
np.mean(cancer_cell_copy_num_df.columns == cancer_cell_gen_expr_df.columns)

1.0