In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

2023-07-29 08:34:07.025386: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# read the file that has smiles and ids

In [3]:
pubchem_to_drugs_df = pd.read_csv('../data/GDSC/1.Drug_listMon Jun 24 09_00_55 2019.csv')

In [4]:
pubchem_to_drugs_df = pubchem_to_drugs_df[["drug_id", "PubCHEM"]]

In [5]:
pubchem_to_drugs_df.dtypes

drug_id     int64
PubCHEM    object
dtype: object

In [6]:
pubchem_to_drugs_df["PubCHEM"] = [val if str(val).isdigit() else np.nan for val in pubchem_to_drugs_df["PubCHEM"] ]

In [7]:
pubchem_to_drugs_df = pubchem_to_drugs_df.dropna()

In [8]:
pubchem_to_drugs_df["PubCHEM"] = pubchem_to_drugs_df["PubCHEM"].astype(np.int64)

In [9]:
pubchem_to_smiles = pd.read_csv('../data/223drugs_pubchem_smiles.txt', sep="\t", header=None)

In [10]:
pubchem_to_smiles.columns = ["PubCHEM", "Smiles"]

In [11]:
pubchem_to_smiles["PubCHEM"] = pubchem_to_smiles["PubCHEM"].astype(np.int64)

In [12]:
pubchem_drugs_smiles_df = pubchem_to_drugs_df.merge(pubchem_to_smiles, on = "PubCHEM")

In [13]:
def get_emb_models(dataset, id_col):
    # std = StandardScaler()
    unique_ids = dataset[id_col].values
    text_vec_layer = tf.keras.layers.TextVectorization(max_tokens = dataset.shape[0] + 2, 
                                                  standardize=None, split = None, 
                                                  output_mode = "int", 
                                                  vocabulary = unique_ids.tolist())
    weights = dataset.drop(id_col, 1).values
    padding_zeros = np.zeros((2, weights.shape[1]))
    weights = np.vstack((padding_zeros, weights))
    # std.fit(weights)
    # weights = std.transform(weights)
    emb_layer = tf.keras.layers.Embedding(dataset.shape[0] + 2, 
                                     weights.shape[1], 
                                     weights = [weights], 
                                     trainable = False)
    input_layer = tf.keras.layers.Input(shape = (1,), dtype = tf.string)
    vec_out = text_vec_layer(input_layer)
    emb_out = emb_layer(vec_out)
    flat_out = tf.keras.layers.Flatten()(emb_out)
    emb_model = tf.keras.models.Model(input_layer, flat_out)
    return emb_model

In [14]:
selected_info_common_cell_lines = "../data/CCLE/cellline_list.txt"
selected_info_common_genes = "../data/CCLE/gene_list.txt"

In [15]:
with open(selected_info_common_genes) as f:
    common_genes = [item.strip() for item in f.readlines()]

In [16]:
len(common_genes)

697

In [17]:
# read cancer cell line data
cancer_cell_copy_num_df = pd.read_csv('../data/CCLE/genomic_copynumber_561celllines_710genes_demap_features.csv')

In [18]:
cancer_cell_copy_num_df.shape

(561, 711)

In [19]:
cancer_cell_copy_num_df.head()

Unnamed: 0.1,Unnamed: 0,AKT3,ABI1,SH2B3,CDH10,CDH11,AKAP9,CDH17,LHFP,CDK4,...,CD79B,BCLAF1,KEAP1,SETDB1,SRGAP3,MAFB,GOLGA5,THRAP3,MED12,CDH1
0,ACH-000828,1.548332,1.040041,1.020747,1.374471,1.0422,0.848321,1.72935,0.847032,1.025471,...,1.121307,0.830675,0.879632,1.492768,1.053714,1.014371,1.213421,0.780331,1.01326,0.64187
1,ACH-000568,0.798791,1.072012,1.038193,1.061508,0.792855,1.061508,1.271905,0.779829,1.002602,...,1.316797,1.049884,0.821346,1.621861,1.291863,1.046269,0.799812,1.074006,1.00551,0.792855
2,ACH-000560,1.080266,0.864616,0.694564,1.053766,0.628903,1.423261,1.3695,0.632267,0.911728,...,1.100098,1.069393,1.232225,0.860476,1.063332,1.194632,1.050246,0.894051,0.873167,0.620049
3,ACH-000561,1.032649,1.408964,0.949651,1.293622,1.274622,1.503638,1.470401,1.198447,0.893277,...,1.063162,0.660578,0.892464,1.252069,0.650263,1.178741,1.052244,0.902153,0.681885,1.274622
4,ACH-000562,1.406262,1.005521,0.997044,1.015986,0.932915,1.007777,1.051055,0.822244,0.997044,...,1.192299,0.974316,0.997944,1.406262,0.810652,1.008153,0.943241,0.878888,0.608959,0.932915


In [20]:
# cancer_cell_copy_num_df[common_genes]

In [21]:
cancer_cell_copy_num_df = cancer_cell_copy_num_df.fillna(cancer_cell_copy_num_df.mean())

  cancer_cell_copy_num_df = cancer_cell_copy_num_df.fillna(cancer_cell_copy_num_df.mean())


In [22]:
int1 = cancer_cell_copy_num_df.columns[1:].tolist()

In [23]:
# read cancer cell line data
cancer_cell_gen_expr_df = pd.read_csv('../data/CCLE/genomic_expression_561celllines_697genes_demap_features.csv')

In [24]:
cancer_cell_gen_expr_df = cancer_cell_gen_expr_df.fillna(cancer_cell_gen_expr_df.mean())

  cancer_cell_gen_expr_df = cancer_cell_gen_expr_df.fillna(cancer_cell_gen_expr_df.mean())


In [25]:
# cancer_cell_gen_expr_df[common_genes]

In [26]:
int2 = cancer_cell_gen_expr_df.columns[1:].tolist()

In [27]:
len(set(int2).intersection(common_genes))

691

In [28]:
common_genes = set(common_genes).intersection(int1)

In [29]:
common_genes = set(common_genes).intersection(int2)

In [30]:
# len(common_genes)

In [31]:
cancer_cell_copy_num_df = pd.concat([cancer_cell_copy_num_df.iloc[:,0], cancer_cell_copy_num_df.iloc[:,1:][common_genes]], axis = 1)

  cancer_cell_copy_num_df = pd.concat([cancer_cell_copy_num_df.iloc[:,0], cancer_cell_copy_num_df.iloc[:,1:][common_genes]], axis = 1)


In [32]:
# common_genes

In [33]:
common_genes = list(common_genes)

In [34]:
import pickle

In [31]:
with open("..//data//common_genes.pickle", "wb") as f:
    pickle.dump(common_genes, f)

In [35]:
cancer_cell_gen_expr_df = pd.concat([cancer_cell_gen_expr_df.iloc[:,0], cancer_cell_gen_expr_df.iloc[:,1:][common_genes]], axis = 1)

In [36]:
cancer_cell_gen_expr_df

Unnamed: 0.1,Unnamed: 0,ERC1,FNBP1,ANK1,PRDM1,NFIB,FANCF,BCL11A,HOXD13,NKX2-1,...,UBR5,LRIG3,NPM1,DNMT3A,IKZF1,AKAP9,EML4,TPR,LSM14A,BCL7A
0,ACH-000828,3.901108,3.831877,0.137504,0.516015,2.244887,2.904966,0.014355,0.014355,0.028569,...,6.458940,2.531069,10.657497,2.560715,0.056584,6.193180,4.037382,5.594549,6.501598,3.775051
1,ACH-000568,1.356144,3.982765,0.757023,0.056584,3.099295,2.307429,0.084064,0.124328,0.475085,...,6.064797,1.298658,10.461070,2.780310,0.443607,3.399171,3.260026,5.823749,5.811214,2.976364
2,ACH-000560,4.289834,4.443607,0.464668,2.397803,4.827819,3.303050,3.767655,0.333424,0.056584,...,6.392489,1.851999,10.301496,6.112700,0.042644,5.801676,4.809414,5.948134,7.581351,4.554589
3,ACH-000561,4.338424,3.522307,0.650765,2.895303,2.327687,2.867896,4.192194,1.887525,0.189034,...,6.527008,4.781360,10.266787,3.824768,0.389567,5.135042,4.669027,5.290572,6.533719,3.823749
4,ACH-000562,2.469886,4.844486,0.097611,1.739848,3.503349,2.397803,0.594549,0.000000,6.763677,...,5.352970,2.147307,11.782998,3.744161,0.028569,4.167519,4.751678,6.173327,5.912171,2.330558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,ACH-000242,3.545968,3.679199,0.333424,0.163499,2.364572,3.182692,0.263034,0.669027,0.584963,...,6.055933,4.124328,11.473807,2.266037,0.163499,4.525443,5.170726,5.466627,6.445429,4.491212
557,ACH-000245,3.612352,2.613532,0.310340,0.992768,1.195348,2.767655,4.475085,0.000000,0.042644,...,6.320485,0.042644,11.678380,1.310340,6.580598,4.659925,4.429616,5.295723,7.263222,6.226894
558,ACH-000244,4.215679,4.522935,2.682573,1.521051,1.384050,2.438293,0.400538,2.405992,0.000000,...,6.060696,4.517906,10.685800,3.280956,0.070389,5.273516,5.327687,5.641546,6.511595,2.347666
559,ACH-000247,2.477677,0.226509,5.984134,1.937344,0.263034,3.246408,0.014355,0.028569,0.014355,...,5.666757,1.367371,10.981282,4.303781,0.070389,4.440952,4.917909,5.399171,5.928370,3.473787


In [37]:
cancer_copy_number_model = get_emb_models(cancer_cell_copy_num_df, "Unnamed: 0")

2023-07-29 08:34:14.713770: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :
2023-07-29 08:34:14.713804: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-07-29 08:34:14.713831: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (c2518.swan.hcc.unl.edu): /proc/driver/nvidia/version does not exist
2023-07-29 08:34:14.715040: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  weights = dataset.drop(id_col, 1).values

In [38]:
cancer_copy_number_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 1)                0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 1, 691)            389033    
                                                                 
 flatten (Flatten)           (None, 691)               0         
                                                                 
Total params: 389,033
Trainable params: 0
Non-trainable params: 389,033
_________________________________________________________________


In [36]:
cancer_copy_number_model.save("..//Models//cancer_copy_number_model_no_norm_common")





INFO:tensorflow:Assets written to: ..//Models//cancer_copy_number_model_no_norm_common/assets


INFO:tensorflow:Assets written to: ..//Models//cancer_copy_number_model_no_norm_common/assets


In [37]:
cancer_cell_gen_expr_df.shape

(561, 692)

In [38]:
cancer_cell_gen_expr_df.head()

Unnamed: 0.1,Unnamed: 0,PTCH1,LATS1,ERBB2,BCL10,NCOR2,NRAS,CCNE1,ACKR3,PRDM2,...,BRCA2,POLG,PHOX2B,GOPC,PRPF40B,CIC,HOXC11,EIF1AX,USP8,ALK
0,ACH-000828,0.695994,2.713696,11.456611,3.349082,6.412104,3.106013,3.723559,1.974529,3.499527,...,2.211012,5.301222,0.0,3.612352,4.625855,4.649041,4.373648,5.694602,4.767125,0.0
1,ACH-000568,2.169925,2.776104,10.959857,2.049631,5.475085,1.981853,3.661065,2.49057,2.432959,...,2.056584,4.800641,0.0,3.764474,5.007644,4.68818,2.568032,5.239169,3.482848,0.070389
2,ACH-000560,3.884598,3.849999,3.300124,1.799087,5.960929,4.488001,4.67638,0.594549,3.799087,...,2.516015,5.763412,0.014355,5.245267,3.960697,5.780048,1.914565,6.024586,4.878725,0.070389
3,ACH-000561,4.297925,2.877744,5.804776,2.65306,5.077243,7.098769,2.952334,6.689858,4.009885,...,2.430285,6.017031,0.0,3.598127,3.957915,3.948601,1.485427,6.373474,5.31687,0.422233
4,ACH-000562,1.201634,2.65306,5.973611,4.74685,5.421223,3.98823,4.313246,0.238787,3.147307,...,2.599318,5.459759,0.014355,3.761285,3.658783,4.100978,0.0,5.681449,4.18428,0.275007


In [39]:
cancer_cell_gen_expr_model = get_emb_models(cancer_cell_gen_expr_df, "Unnamed: 0")

  weights = dataset.drop(id_col, 1).values


In [40]:
cancer_cell_gen_expr_model.save("..//Models//cancer_cell_gen_expr_model_no_norm_common")





INFO:tensorflow:Assets written to: ..//Models//cancer_cell_gen_expr_model_no_norm_common/assets


INFO:tensorflow:Assets written to: ..//Models//cancer_cell_gen_expr_model_no_norm_common/assets


In [41]:
cancer_cell_gen_expr_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 1)                0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 1, 691)            389033    
                                                                 
 flatten_1 (Flatten)         (None, 691)               0         
                                                                 
Total params: 389,033
Trainable params: 0
Non-trainable params: 389,033
_________________________________________________________________
