In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import sys
import os
import xgboost as xgb 
from importlib import reload
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error
from collections import namedtuple
import time
import pickle
import scipy
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Root directory of the shared codebase (project modules and small data files).
# The DRP_CODEBASE_PATH environment variable overrides it on other machines;
# without it the original location is used.
codebase_path = os.environ.get('DRP_CODEBASE_PATH', '/data/home/wpw035/Codebase')

# Keep the codebase first on the import path. Dropping an existing entry before
# inserting means re-running this cell does not add another duplicate.
if codebase_path in sys.path:
    sys.path.remove(codebase_path)
sys.path.insert(0, codebase_path) #add path to my codebase models

In [3]:
#my modules
# Project-local helper modules. Each import is followed by reload() so that
# edits made to a module's source are picked up without restarting the kernel.
# The order of the statements is kept as written.
from DRP_utils import data_preprocessing as dp_nb
reload(dp_nb)

from DRP_utils import testing as t_nb
reload(t_nb)
import Data_imports as di_nb
reload(di_nb)
import pairs_train_test_split as  tts_nb
reload(tts_nb)

# The learning-curve driver used by the run loop; as the last expression of the
# cell, the reloaded module object is what the cell displays.
import Learning_curve_xgboost as lc_nb
reload(lc_nb)


<module 'Learning_curve_xgboost' from '/data/home/wpw035/Drug_response_prediction/DRP-alpha-preliminary-results/XGboost/Unseen_cell_line_testing/Learning_curve_xgboost.py'>

In [4]:
# Load the five input tables through the project data-import module.
input_tables = di_nb.read_input_data()
prot, rna, one_hot_cls, one_hot_drugs, ic50_df1 = input_tables

# Labels reused by later cells: the row labels of the prot table and the
# column labels of the IC50 table.
_all_drugs = ic50_df1.columns
_all_cls = prot.index

  return func(*args, **kwargs)


Number of missing prot values 0.386335609896865
num non overlapping prot and target cls: 10
num non overlapping rna prot and target cls: 91


In [5]:
# Shapes of the four feature tables, in the order prot, rna, one-hot cell lines, one-hot drugs.
tuple(table.shape for table in (prot, rna, one_hot_cls, one_hot_drugs))

((877, 8457), (877, 17417), (877, 877), (345, 345))

## Feature selection (FS) and creating data for each drug

### RNA FS

In [6]:
#load the landmark gene list (tab separated) and index it by gene symbol
landmark_path = f'{codebase_path}/downloaded_data_small/landmark_genes_LINCS.txt'
landmark_genes = pd.read_csv(landmark_path, sep='\t')
landmark_genes.index = landmark_genes['Symbol']

#one row per non-null rna column label, indexed by that label,
#with repeated rows removed
rna_labels = rna.columns.dropna()
dft = pd.DataFrame(rna_labels)
dft.index = rna_labels
dft = dft[~dft.duplicated()]

#landmark symbols that overlap with the rna column labels
#(keep_overlapping is a project helper; its exact matching rule is not visible here)
landmark_symbols = pd.DataFrame(landmark_genes['Symbol'])
overlapping_landmarks, _ = dp_nb.keep_overlapping(landmark_symbols, dft)
overlapping_landmarks = overlapping_landmarks['Symbol'].values

#create input data for each drug from the selected rna columns
x_all, x_drug, y_list = dp_nb.create_all_drugs(
    rna[overlapping_landmarks], one_hot_drugs, ic50_df1, _all_cls)
x_all = x_all.astype(np.float32)
x_drug = x_drug.astype(np.float16)

#relabel all three objects with '<x_all label>::<x_drug label>' so every row
#is identified by its cell line drug pair
cls_drugs_index = x_all.index + '::' + x_drug.index
y_list.index = cls_drugs_index
x_drug.index = cls_drugs_index
x_all.index = cls_drugs_index

x_all.shape, x_drug.shape, len(y_list)

((263375, 908), (263375, 345), 263375)

### Prot FS

In [7]:
#reuse the landmark gene list loaded for the rna feature selection and
#find the landmark symbols that overlap with the prot column labels
prot_labels = prot.columns.dropna()
dft = pd.DataFrame(prot_labels)
dft.index = prot_labels
dft = dft[~dft.duplicated()]

landmark_symbols = pd.DataFrame(landmark_genes['Symbol'])
overlapping_landmarks, _ = dp_nb.keep_overlapping(landmark_symbols, dft)
overlapping_landmarks = overlapping_landmarks['Symbol'].values

#create prot data for all drugs
#note: x_drug and y_list are rebound here, replacing the objects of the same
#name that the rna cell created
x_all_prot, x_drug, y_list = dp_nb.create_all_drugs(
    prot[overlapping_landmarks], one_hot_drugs, ic50_df1, _all_cls)
x_all_prot = x_all_prot.astype(np.float32)

#relabel rows with '<x_all_prot label>::<x_drug label>' to identify the
#cell line drug pairs
cls_drugs_index = x_all_prot.index + '::' + x_drug.index
x_drug.index = cls_drugs_index
y_list.index = cls_drugs_index
x_all_prot.index = cls_drugs_index

### Create one-hot data for all drugs

In [8]:
#build the per-pair inputs from the one-hot cell line table
x_hot, x_drug_hot, y_hot = dp_nb.create_all_drugs(
    one_hot_cls, one_hot_drugs, ic50_df1, _all_cls)

#same '<row label>::<drug label>' pair index as used for the omics inputs
cls_drugs_index_hot = x_hot.index + '::' + x_drug_hot.index
x_hot.index = cls_drugs_index_hot

#replace the column labels with consecutive integers that start at the number
#of x_drug columns (presumably so they stay distinct from the drug columns
#when the two frames are concatenated later -- confirm)
n_drug_cols = len(x_drug.columns)
x_hot.columns = np.arange(n_drug_cols, n_drug_cols + len(x_hot.columns))

# Set hyperparameter search space

In [9]:

#candidate values for each hyperparameter; n_estimators has a single value
params = {
    'max_depth': [3, 5, 6, 10, 15, 20],
    'learning_rate': [0.1, 0.2, 0.3],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.6, 0.8, 1.0],
    'n_estimators': [25000],
}

#every combination of the values above
parm_grid = ParameterGrid(params)

#store the one-hot inputs as 32 bit floats, matching the omics inputs
x_hot = x_hot.astype(np.float32)

# Learning curves: training-set size grid

In [10]:
#one split with a fixed seed, used only to learn how large the train set is
rand_seed = 1
pairs_with_truth_vals = y_list.index
train_pairs, test_pairs, val_pairs = tts_nb.split(
    rand_seed, _all_cls, _all_drugs, pairs_with_truth_vals)

#rna test train selection
x_train_rna = x_all.loc[train_pairs]
x_test_rna = x_all.loc[test_pairs]

#train sizes to evaluate: integer powers of two with exponents evenly spaced
#from 1 to 17.6, plus the full train set size, sorted with repeats removed
log_spaced_sizes = np.logspace(1, 17.6, base=2.0).astype(int)
lg_space = np.unique(np.append(log_spaced_sizes, len(x_train_rna)))
lg_space

Fraction of cls in sets, relative to all clsbefore mising values are removed
train fraction 0.7993158494868872, test fraction 0.10034207525655645,validaiton fraciton 0.10034207525655645
------
Fraction of cls in sets, relative to all cl drug pairs, aftermising values are removed
train fraction 0.6972253895857089, test fraction0.08817939946788293, validaiton fraciton 0.0850693239469205


array([     2,      3,      4,      5,      6,      8,     10,     13,
           16,     20,     26,     33,     42,     53,     67,     85,
          108,    136,    173,    219,    277,    350,    443,    560,
          708,    896,   1133,   1433,   1813,   2293,   2900,   3668,
         4638,   5866,   7419,   9383,  11867,  15008,  18980,  24004,
        30358,  38393,  48555,  61407,  77660,  98216, 124212, 157089,
       198668, 210956])

# Learning curve runs

In [11]:
#finds a test train split then finds the learning curve
#for that split. Repeats for multiple (N) test train splits
N = 30
#index of the first run computed by this cell; lower run indices are skipped
#(presumably they were produced in an earlier session -- confirm)
first_run = 20

data_type = 'One-hot'

#make sure every output folder exists before hours of training are spent,
#so a run cannot fail at the point where its results are saved
for out_dir in ('optimal-models', 'LC-metric-results',
                'Optimal-hyperparameters', 'test_train_cls'):
    os.makedirs(f'{out_dir}/{data_type}', exist_ok=True)

#the pairs that have a target value do not change between runs
pairs_with_truth_vals = y_list.index

t1 = time.time()
for run in range(first_run, N):
    print(f'run {run} of {N}')
    #test train split, a different seed for every run
    rand_seed = 42 + run
    train_pairs, test_pairs, val_pairs = tts_nb.split(
        rand_seed, _all_cls, _all_drugs, pairs_with_truth_vals)

    #rna test train selection
    x_train_rna, x_test_rna = x_all.loc[train_pairs], x_all.loc[test_pairs]
    x_val_rna = x_all.loc[val_pairs]
    y_train, y_test = y_list[train_pairs], y_list[test_pairs]
    y_val = y_list[val_pairs]
    xdrug_train, xdrug_test = x_drug.loc[train_pairs], x_drug.loc[test_pairs]
    xdrug_val = x_drug.loc[val_pairs]

    #prot test train selection
    x_train_prot, x_test_prot = x_all_prot.loc[train_pairs], x_all_prot.loc[test_pairs]
    x_val_prot = x_all_prot.loc[val_pairs]

    #one hot test train selection
    x_train_hot, x_test_hot = x_hot.loc[train_pairs], x_hot.loc[test_pairs]
    x_val_hot = x_hot.loc[val_pairs]

    #concat of omic and drug data
    x_train_rna = pd.concat((x_train_rna, xdrug_train), axis=1)
    x_val_rna = pd.concat((x_val_rna, xdrug_val), axis=1)
    x_test_rna = pd.concat((x_test_rna, xdrug_test), axis=1)

    x_train_prot = pd.concat((x_train_prot, xdrug_train), axis=1)
    x_val_prot = pd.concat((x_val_prot, xdrug_val), axis=1)
    x_test_prot = pd.concat((x_test_prot, xdrug_test), axis=1)

    x_train_hot = pd.concat((x_train_hot, xdrug_train), axis=1)
    x_val_hot = pd.concat((x_val_hot, xdrug_val), axis=1)
    x_test_hot = pd.concat((x_test_hot, xdrug_test), axis=1)

    #consistency checks: every representation of a split must list the
    #same pairs in the same order
    assert (x_train_hot.index == x_train_rna.index).all()
    assert (x_test_hot.index == x_test_rna.index).all()
    assert (x_val_hot.index == x_val_rna.index).all()

    assert (x_train_prot.index == x_train_rna.index).all()
    assert (x_test_prot.index == x_test_rna.index).all()
    assert (x_val_prot.index == x_val_rna.index).all()

    #targets and drug inputs must line up with the features for all three
    #splits; the validation split is what the model search is scored on
    assert (y_train.index == x_train_rna.index).all()
    assert (y_test.index == x_test_rna.index).all()
    assert (y_val.index == x_val_rna.index).all()
    assert (xdrug_train.index == x_train_rna.index).all()
    assert (xdrug_test.index == x_test_rna.index).all()
    assert (xdrug_val.index == x_val_rna.index).all()

    #inconsistency checks: the three data types must differ in their
    #number of columns, otherwise one was probably substituted for another
    assert x_train_rna.shape[1] != x_train_prot.shape[1]
    assert x_test_rna.shape[1] != x_test_prot.shape[1]
    assert x_val_rna.shape[1] != x_val_prot.shape[1]

    assert x_train_rna.shape[1] != x_train_hot.shape[1]
    assert x_test_rna.shape[1] != x_test_hot.shape[1]
    assert x_val_rna.shape[1] != x_val_hot.shape[1]

    assert x_train_prot.shape[1] != x_train_hot.shape[1]
    assert x_test_prot.shape[1] != x_test_hot.shape[1]
    assert x_val_prot.shape[1] != x_val_hot.shape[1]

    #the rna and prot frames were only needed for the checks above;
    #free them before training on the one-hot data
    del x_train_prot, x_val_prot, x_test_prot
    del x_train_rna, x_val_rna, x_test_rna

    #NOTE(review): lg_space was built from a different split (rand_seed = 1),
    #so its largest entry need not equal len(x_train_hot) for this run.
    #Confirm how run_lc_xg_ucl treats a train size above the train set size.
    model_path = f'optimal-models/{data_type}/run{run}model_train_size_'
    mse_r2, bms, bhps = lc_nb.run_lc_xg_ucl(
        model_func=xgb.XGBRegressor,
        param_grid=parm_grid,
        xtrain=x_train_hot,
        ytrain=y_train,
        xval=x_val_hot,
        yval=y_val,
        xtest=x_test_hot,
        ytest=y_test,
        train_sizes=lg_space,
        num_trails=15,
        es_rounds=500,
        model_save_path=model_path,
    )

    #save data
    #--------- Dont overwrite existing data ------------
    mse_r2.to_csv(f'LC-metric-results/{data_type}/run{run}')

    #collect the best hyperparameters into one list of values per
    #hyperparameter name
    d = {}
    for hp in bhps:
        for k, v in hp.items():
            d.setdefault(k, []).append(v)
    bhps_df = pd.DataFrame(d)
    bhps_df.to_csv(f'Optimal-hyperparameters/{data_type}/run{run}df')
    with open(f'Optimal-hyperparameters/{data_type}/run{run}.pkl', 'wb') as f:
        pickle.dump(bhps, f)

    #record which pairs went into each split so the run can be reproduced
    np.savetxt(f'test_train_cls/{data_type}/train_pairs{run}', train_pairs, fmt='%s')
    np.savetxt(f'test_train_cls/{data_type}/test_pairs{run}', test_pairs, fmt='%s')
    np.savetxt(f'test_train_cls/{data_type}/val_pairs{run}', val_pairs, fmt='%s')

    del bms, bhps
    
  

run 20 of 30
Fraction of cls in sets, relative to all clsbefore mising values are removed
train fraction 0.7993158494868872, test fraction 0.10034207525655645,validaiton fraciton 0.10034207525655645
------
Fraction of cls in sets, relative to all cl drug pairs, aftermising values are removed
train fraction 0.6939236197180771, test fraction0.08868177085915423, validaiton fraciton 0.08786872242328095
1 of 50 train sizes




2 of 50 train sizes




3 of 50 train sizes
4 of 50 train sizes
5 of 50 train sizes
6 of 50 train sizes
7 of 50 train sizes
8 of 50 train sizes
9 of 50 train sizes
10 of 50 train sizes
11 of 50 train sizes
12 of 50 train sizes
13 of 50 train sizes
14 of 50 train sizes
15 of 50 train sizes
16 of 50 train sizes
17 of 50 train sizes
18 of 50 train sizes
19 of 50 train sizes
20 of 50 train sizes
21 of 50 train sizes
22 of 50 train sizes
23 of 50 train sizes
24 of 50 train sizes
25 of 50 train sizes
26 of 50 train sizes
27 of 50 train sizes
28 of 50 train sizes
29 of 50 train sizes
30 of 50 train sizes
31 of 50 train sizes
32 of 50 train sizes
33 of 50 train sizes
34 of 50 train sizes
35 of 50 train sizes
36 of 50 train sizes
37 of 50 train sizes
38 of 50 train sizes
39 of 50 train sizes
40 of 50 train sizes
41 of 50 train sizes
42 of 50 train sizes
43 of 50 train sizes
44 of 50 train sizes
45 of 50 train sizes
46 of 50 train sizes
47 of 50 train sizes
48 of 50 train sizes
49 of 50 train sizes
50 of 50 train sizes



2 of 50 train sizes
3 of 50 train sizes
4 of 50 train sizes
5 of 50 train sizes
6 of 50 train sizes
7 of 50 train sizes
8 of 50 train sizes
9 of 50 train sizes
10 of 50 train sizes
11 of 50 train sizes
12 of 50 train sizes
13 of 50 train sizes
14 of 50 train sizes
15 of 50 train sizes
16 of 50 train sizes
17 of 50 train sizes
18 of 50 train sizes
19 of 50 train sizes
20 of 50 train sizes
21 of 50 train sizes
22 of 50 train sizes
23 of 50 train sizes
24 of 50 train sizes
25 of 50 train sizes
26 of 50 train sizes
27 of 50 train sizes
28 of 50 train sizes
29 of 50 train sizes
30 of 50 train sizes
31 of 50 train sizes
32 of 50 train sizes
33 of 50 train sizes
34 of 50 train sizes
35 of 50 train sizes
36 of 50 train sizes
37 of 50 train sizes
38 of 50 train sizes
39 of 50 train sizes
40 of 50 train sizes
41 of 50 train sizes
42 of 50 train sizes
43 of 50 train sizes
44 of 50 train sizes
45 of 50 train sizes
46 of 50 train sizes
47 of 50 train sizes
48 of 50 train sizes
49 of 50 train sizes




2 of 50 train sizes




3 of 50 train sizes
4 of 50 train sizes
5 of 50 train sizes
6 of 50 train sizes
7 of 50 train sizes
8 of 50 train sizes
9 of 50 train sizes
10 of 50 train sizes
11 of 50 train sizes
12 of 50 train sizes
13 of 50 train sizes
14 of 50 train sizes
15 of 50 train sizes
16 of 50 train sizes
17 of 50 train sizes
18 of 50 train sizes
19 of 50 train sizes
20 of 50 train sizes
21 of 50 train sizes
22 of 50 train sizes
23 of 50 train sizes
24 of 50 train sizes
25 of 50 train sizes
26 of 50 train sizes
27 of 50 train sizes
28 of 50 train sizes
29 of 50 train sizes
30 of 50 train sizes
31 of 50 train sizes
32 of 50 train sizes
33 of 50 train sizes
34 of 50 train sizes
35 of 50 train sizes
36 of 50 train sizes
37 of 50 train sizes
38 of 50 train sizes
39 of 50 train sizes
40 of 50 train sizes
41 of 50 train sizes
42 of 50 train sizes
43 of 50 train sizes
44 of 50 train sizes
45 of 50 train sizes
46 of 50 train sizes
47 of 50 train sizes
48 of 50 train sizes
49 of 50 train sizes
50 of 50 train sizes



2 of 50 train sizes
3 of 50 train sizes
4 of 50 train sizes
5 of 50 train sizes
6 of 50 train sizes
7 of 50 train sizes
8 of 50 train sizes
9 of 50 train sizes
10 of 50 train sizes
11 of 50 train sizes
12 of 50 train sizes
13 of 50 train sizes
14 of 50 train sizes
15 of 50 train sizes
16 of 50 train sizes
17 of 50 train sizes
18 of 50 train sizes
19 of 50 train sizes
20 of 50 train sizes
21 of 50 train sizes
22 of 50 train sizes
23 of 50 train sizes
24 of 50 train sizes
25 of 50 train sizes
26 of 50 train sizes
27 of 50 train sizes
28 of 50 train sizes
29 of 50 train sizes
30 of 50 train sizes
31 of 50 train sizes
32 of 50 train sizes
33 of 50 train sizes
34 of 50 train sizes
35 of 50 train sizes
36 of 50 train sizes
37 of 50 train sizes
38 of 50 train sizes
39 of 50 train sizes
40 of 50 train sizes
41 of 50 train sizes
42 of 50 train sizes
43 of 50 train sizes
44 of 50 train sizes
45 of 50 train sizes
46 of 50 train sizes
47 of 50 train sizes
48 of 50 train sizes
49 of 50 train sizes




2 of 50 train sizes
3 of 50 train sizes
4 of 50 train sizes
5 of 50 train sizes
6 of 50 train sizes
7 of 50 train sizes
8 of 50 train sizes
9 of 50 train sizes
10 of 50 train sizes
11 of 50 train sizes
12 of 50 train sizes
13 of 50 train sizes
14 of 50 train sizes
15 of 50 train sizes
16 of 50 train sizes
17 of 50 train sizes
18 of 50 train sizes
19 of 50 train sizes
20 of 50 train sizes
21 of 50 train sizes
22 of 50 train sizes
23 of 50 train sizes
24 of 50 train sizes
25 of 50 train sizes
26 of 50 train sizes
27 of 50 train sizes
28 of 50 train sizes
29 of 50 train sizes
30 of 50 train sizes
31 of 50 train sizes
32 of 50 train sizes
33 of 50 train sizes
34 of 50 train sizes
35 of 50 train sizes
36 of 50 train sizes
37 of 50 train sizes
38 of 50 train sizes
39 of 50 train sizes
40 of 50 train sizes
41 of 50 train sizes
42 of 50 train sizes
43 of 50 train sizes
44 of 50 train sizes
45 of 50 train sizes
46 of 50 train sizes
47 of 50 train sizes
48 of 50 train sizes
49 of 50 train sizes




3 of 50 train sizes
4 of 50 train sizes
5 of 50 train sizes




6 of 50 train sizes
7 of 50 train sizes
8 of 50 train sizes
9 of 50 train sizes




10 of 50 train sizes
11 of 50 train sizes
12 of 50 train sizes
13 of 50 train sizes
14 of 50 train sizes
15 of 50 train sizes
16 of 50 train sizes
17 of 50 train sizes
18 of 50 train sizes
19 of 50 train sizes
20 of 50 train sizes
21 of 50 train sizes
22 of 50 train sizes
23 of 50 train sizes
24 of 50 train sizes
25 of 50 train sizes
26 of 50 train sizes
27 of 50 train sizes
28 of 50 train sizes
29 of 50 train sizes
30 of 50 train sizes
31 of 50 train sizes
32 of 50 train sizes
33 of 50 train sizes
34 of 50 train sizes
35 of 50 train sizes
36 of 50 train sizes
37 of 50 train sizes
38 of 50 train sizes
39 of 50 train sizes
40 of 50 train sizes
41 of 50 train sizes
42 of 50 train sizes
43 of 50 train sizes
44 of 50 train sizes
45 of 50 train sizes
46 of 50 train sizes
47 of 50 train sizes
48 of 50 train sizes
49 of 50 train sizes
50 of 50 train sizes
total time elapsed (s)
9489.897300481796
run 28 of 30
Fraction of cls in sets, relative to all clsbefore mising values are removed
train fr

In [16]:
#seconds elapsed since t1 was recorded just before the learning curve loop
finished_at = time.time()
delt = finished_at - t1
delt

92003.01188540459

In [17]:
#elapsed time in hours, computed from the measured delt (seconds) rather than
#from a number pasted out of an earlier output
delt / (60*60)

25.556392190390163