In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn
import sys
import os
import time
import scipy
import pickle
from importlib import reload
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from sklearn.metrics import r2_score
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error
from collections import namedtuple
import keras_tuner as kt 
codebase_path = '/data/home/wpw035/Codebase'
sys.path.insert(0, codebase_path) #add path to my codebase models
from scipy.optimize import curve_fit

In [2]:
from DRP_utils import data_preprocessing as dp_nb
reload(dp_nb)
from DRP_utils import model_selection as ms_nb
reload(ms_nb)
from DRP_utils import testing as t_nb
reload(t_nb)
import Data_imports as di_nb
reload(di_nb)
import pairs_train_test_split as tts_nb
import Learning_curve as lc_nb

In [3]:
#input data: unpack the six tables returned by the project loader
(prot, rna, phospho_ls,
 one_hot_cls, one_hot_drugs, ic50_df1) = di_nb.read_input_data()

#row labels of prot and column labels of ic50_df1, reused by later cells
#(presumably cell lines and drugs respectively -- verify against the loader)
_all_cls, _all_drugs = prot.index, ic50_df1.columns

#every table checked here must have the same number of rows
assert len({prot.shape[0], rna.shape[0], phospho_ls.shape[0]}) == 1
assert one_hot_cls.shape[0] == phospho_ls.shape[0]
tuple(table.shape for table in (prot, rna, phospho_ls, one_hot_cls, one_hot_drugs))

  return func(*args, **kwargs)


Number of missing prot values 0.386335609896865
num non overlapping prot and target cls: 10
num non overlapping cls: 930


((38, 8457), (38, 17417), (38, 22804), (38, 38), (345, 345))

## Feature selection (FS) and data creation for each drug

In [4]:
#read in the landmark gene table used for feature selection and index it
#by gene symbol (the 'Symbol' column itself is kept)
landmark_genes = pd.read_csv(
    f'{codebase_path}/downloaded_data_small/landmark_genes_LINCS.txt',
    sep='\t',
)
landmark_genes.index = landmark_genes['Symbol']

#landmarks that overlap with the rna data; only the first value returned
#by keep_overlapping is needed
overlapping_landmarks, _ = dp_nb.keep_overlapping(
    pd.DataFrame(landmark_genes['Symbol']), rna.T)
overlapping_landmarks = overlapping_landmarks['Symbol'].values

#create input data for each drug from the selected rna columns
x_all, x_drug, y_list = dp_nb.create_all_drugs(
    rna[overlapping_landmarks], one_hot_drugs, ic50_df1, _all_cls)

#fmt index to include drug cell line pairs: '<x_all label>::<x_drug label>'
cls_drugs_index = x_all.index + '::' + x_drug.index
y_list.index = cls_drugs_index
x_drug.index = cls_drugs_index
x_all.index = cls_drugs_index

#downcast the feature tables
x_drug = x_drug.astype(np.float16)
x_all = x_all.astype(np.float32)

x_all.shape, x_drug.shape, len(y_list)

((11583, 908), (11583, 345), 11583)

In [5]:
#feature selection for the prot data, using the same landmark genes
#that were used for feature selection with the rna data

#landmark genes that are also prot features
overlapping_landmarks, _ = dp_nb.keep_overlapping(
    pd.DataFrame(landmark_genes['Symbol']), prot.T)
overlapping_landmarks = overlapping_landmarks['Symbol'].values

#create prot data for all drugs
#NOTE(review): this rebinds x_drug and y_list, so the float16 cast applied
#to x_drug in the rna cell does not carry over to the object used by later
#cells -- confirm that this is intended
x_all_prot, x_drug, y_list = dp_nb.create_all_drugs(
    prot[overlapping_landmarks], one_hot_drugs, ic50_df1, _all_cls)
x_all_prot = x_all_prot.astype(np.float32)

#fmt index to include drug cell line pairs
cls_drugs_index = x_all_prot.index + '::' + x_drug.index
x_drug.index = cls_drugs_index
y_list.index = cls_drugs_index
x_all_prot.index = cls_drugs_index

x_all_prot.shape

(11583, 721)

In [6]:
#one hot data creation for all drugs
x_hot, x_drug_hot, y_hot = dp_nb.create_all_drugs(
    one_hot_cls, one_hot_drugs, ic50_df1, _all_cls)

#label each row as '<x_hot label>::<x_drug_hot label>'
cls_drugs_index_hot = x_hot.index + '::' + x_drug_hot.index
x_hot.index = cls_drugs_index_hot

#renumber the one-hot columns so that they start at the number of
#x_drug columns and count upwards from there
x_hot.columns = len(x_drug.columns) + np.arange(len(x_hot.columns))
x_hot.shape

(11583, 38)

## Model building

In [7]:
_input_shape = None


def build_cnn_kt(hp):
    """Build and compile the two-input CNN regressor for a KerasTuner search.

    The model has two inputs: a feature sequence of shape
    ``(_input_shape, 1)`` and a drug vector whose width is
    ``xdrug_train.shape[1]``. The feature sequence goes through two
    Conv1D + MaxPooling1D stages, is flattened and passed through two
    Dense layers; the result is concatenated with the drug vector and
    passed through two more Dense layers and a single-unit output layer.

    Relies on two notebook globals that must be set before the tuner
    calls this function: ``_input_shape`` (number of feature columns)
    and ``xdrug_train`` (only its column count is read).

    Args:
        hp: KerasTuner hyperparameter container. Registers 'filts',
            'units', 'units_hid' and 'lr'.

    Returns:
        A compiled ``tf.keras.Model`` (RMSprop optimiser, mean squared
        error loss, 'mae' metric).

    Raises:
        ValueError: if the global ``_input_shape`` has not been set.
    """
    if _input_shape is None:
        raise ValueError('add input shape dim')

    # KerasTuner hands back the same value every time a hyperparameter name
    # is reused, so each one is sampled once and shared by the pair of
    # layers that previously requested it by name.
    n_filters = hp.Int('filts', 8, 32, 8)
    n_units = hp.Int('units', 32, 258, 32)
    n_hidden = hp.Int('units_hid', 32, 258, 32)
    learning_rate = hp.Choice('lr', [1e-4, 1e-3])

    # Convolutional branch over the feature sequence.
    feat_input = layers.Input(shape=(_input_shape, 1))
    x = layers.Conv1D(
        filters=n_filters, kernel_size=16, activation='relu')(feat_input)
    x = layers.MaxPooling1D(pool_size=2)(x)
    x = layers.Conv1D(
        filters=n_filters, kernel_size=8, activation='relu')(x)
    x = layers.MaxPooling1D(pool_size=2)(x)
    x = layers.Flatten()(x)
    x = layers.Dense(n_units, activation='relu')(x)
    x = layers.Dense(n_units, activation='relu')(x)

    # Drug branch: `shape` must be a tuple, hence the trailing comma.
    drug_input = layers.Input(shape=(xdrug_train.shape[1],))

    # Joint head on the concatenated representations.
    concatenated = layers.concatenate([x, drug_input])
    hidd = layers.Dense(n_hidden, activation='relu')(concatenated)
    hidd = layers.Dense(n_hidden, activation='relu')(hidd)
    output_tensor = layers.Dense(1)(hidd)

    model = tf.keras.Model([feat_input, drug_input], output_tensor)
    opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    model.compile(
        optimizer=opt,
        loss=tf.keras.metrics.mean_squared_error,
        metrics=['mae'])
    return model

In [8]:
#train size relative to total data set size
_train_size = 0.6

#training-set sizes for the learning curve: 5 log2-spaced sizes from 2 to 64
#joined with 50 log2-spaced sizes from 64 up to len(x_all) * _train_size,
#truncated to ints, de-duplicated and sorted
lg2 = np.logspace(
    np.log2(64), np.log2(len(x_all) * _train_size), base=2.0, num=50
).astype(int)
lg_space = np.unique(np.concatenate(
    (np.logspace(1, np.log2(64), base=2.0, num=5).astype(int), lg2)))
lg_space

array([   2,    4,   11,   26,   64,   70,   77,   85,   93,  103,  113,
        125,  137,  151,  166,  183,  201,  221,  244,  268,  295,  325,
        358,  394,  433,  477,  525,  577,  635,  699,  769,  847,  932,
       1025, 1128, 1242, 1366, 1503, 1654, 1821, 2003, 2205, 2426, 2669,
       2937, 3232, 3557, 3914, 4307, 4740, 5215, 5739, 6315, 6949])

In [None]:
#One hot
#Draws a train/val/test split, then runs the learning curve for that
#split. Repeats for multiple (N) splits, saving the metrics, best
#hyperparameters, best models and split indices of every run.
N = 30
data_type = 'One-hot'

#create the result folders up front so that a missing directory is caught
#here and not after a full learning curve has already been computed
for out_dir in ('LC-metric-results', 'Optimal-hyperparameters', 'train_test_inds'):
    os.makedirs(f'{out_dir}/{data_type}', exist_ok=True)

#pairs handed to the splitter (index of y_list); identical for every run
pairs_with_truth_vals = y_list.index

t1 = time.time()
for run in range(N):
    print(f'run {run} of {N}')
    #test train split, with a different seed for every run
    rand_seed = 42 + run
    train_pairs, test_pairs, val_pairs = tts_nb.split(
        rand_seed, _all_cls, _all_drugs, pairs_with_truth_vals,
        train_size=_train_size)

    #rna test train selection
    x_train_rna, x_test_rna = x_all.loc[train_pairs], x_all.loc[test_pairs]
    x_val_rna = x_all.loc[val_pairs]
    y_train, y_test = y_list[train_pairs], y_list[test_pairs]
    y_val = y_list[val_pairs]
    xdrug_train, xdrug_test = x_drug.loc[train_pairs], x_drug.loc[test_pairs]
    xdrug_val = x_drug.loc[val_pairs]

    #prot test train selection
    x_train_prot, x_test_prot = x_all_prot.loc[train_pairs], x_all_prot.loc[test_pairs]
    x_val_prot = x_all_prot.loc[val_pairs]

    #one hot test train selection
    x_train_hot, x_test_hot = x_hot.loc[train_pairs], x_hot.loc[test_pairs]
    x_val_hot = x_hot.loc[val_pairs]

    #consistency checks: every table must hold the same rows in the same order
    assert (x_train_hot.index == x_train_rna.index).all()
    assert (x_test_hot.index == x_test_rna.index).all()
    assert (x_val_hot.index == x_val_rna.index).all()

    assert (x_train_prot.index == x_train_rna.index).all()
    assert (x_test_prot.index == x_test_rna.index).all()
    assert (x_val_prot.index == x_val_rna.index).all()

    assert (y_train.index == x_train_rna.index).all()
    assert (y_test.index == x_test_rna.index).all()
    assert (y_val.index == x_val_rna.index).all()

    assert (xdrug_train.index == x_train_rna.index).all()
    assert (xdrug_test.index == x_test_rna.index).all()
    assert (xdrug_val.index == x_val_rna.index).all()

    #inconsistency checks: the three data types must differ in feature count,
    #which guards against two of them being the same table by mistake
    assert x_train_rna.shape[1] != x_train_prot.shape[1]
    assert x_test_rna.shape[1] != x_test_prot.shape[1]
    assert x_val_rna.shape[1] != x_val_prot.shape[1]

    assert x_train_rna.shape[1] != x_train_hot.shape[1]
    assert x_test_rna.shape[1] != x_test_hot.shape[1]
    assert x_val_rna.shape[1] != x_val_hot.shape[1]

    assert x_train_prot.shape[1] != x_train_hot.shape[1]
    assert x_test_prot.shape[1] != x_test_hot.shape[1]
    assert x_val_prot.shape[1] != x_val_hot.shape[1]

    #the rna and prot slices were only needed for the checks above
    del x_train_rna, x_val_rna, x_test_rna
    del x_train_prot, x_val_prot, x_test_prot

    #run the learning curve
    #_input_shape is a module-level global read by build_cnn_kt
    _input_shape = x_train_hot.shape[1]
    mse_r2, bms, bhps = lc_nb.run_lc_ucl(
        build_cnn_kt,
        [x_train_hot, xdrug_train],
        y_train,
        [x_val_hot, xdrug_val],
        y_val,
        [x_test_hot, xdrug_test],
        y_test,
        lg_space,
        num_trails=15,
        epochs=100,
        direc='UCL-del2')

    #save data
    mse_r2.to_csv(f'LC-metric-results/{data_type}/run{run}')

    #best hyperparameters: once as a table, once pickled as returned
    bhps_df = pd.DataFrame([bhp.values for bhp in bhps])
    bhps_df.to_csv(f'Optimal-hyperparameters/{data_type}/run{run}df')
    with open(f'Optimal-hyperparameters/{data_type}/run{run}.pkl', 'wb') as f:
        pickle.dump(bhps, f)

    #one saved model per training-set size
    #NOTE(review): unlike the other output paths there is no '/' between
    #'optimal-models' and data_type here; left unchanged because earlier
    #runs may already be stored under this name -- confirm before changing
    model_path = f'optimal-models{data_type}/run{run}/model_train_size_'
    for train_size, model in zip(lg_space, bms):
        model.save(model_path + str(train_size))

    #pair labels of each split, one label per line
    np.savetxt(f'train_test_inds/{data_type}/train_inds{run}', y_train.index, fmt='%s')
    np.savetxt(f'train_test_inds/{data_type}/test_inds{run}', y_test.index, fmt='%s')
    np.savetxt(f'train_test_inds/{data_type}/val_inds{run}', y_val.index, fmt='%s')

#total wall-clock time of all runs, in seconds
delt = time.time() - t1

Trial 5 Complete [00h 00m 03s]
val_loss: 5.201609134674072

Best val_loss So Far: 5.201609134674072
Total elapsed time: 00h 00m 44s

Search: Running Trial #6

Hyperparameter    |Value             |Best Value So Far 
filts             |16                |24                
units             |160               |224               
units_hid         |96                |224               
lr                |0.0001            |0.001             

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch