In [8]:
from recover.datasets.drugcomb_matrix_data import (DrugCombMatrix, DrugCombMatrixDrugLevelSplitTrain,
DrugCombMatrixDrugLevelSplitTest, DrugCombMatrixTrainOneil, DrugCombMatrixTestAlmanac,
DrugCombMatrixOneHiddenDrugSplitTrain, DrugCombMatrixOneHiddenDrugSplitTest)
import torch
import numpy as np
from sklearn.svm import LinearSVR
from scipy.stats import spearmanr
from scipy import stats
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [9]:
def get_set(dataset, indices, edge_idx, cell_line_option):
    """Build the feature matrix for a set of drug-pair examples.

    Each row is the concatenation of the features of the two drugs of a pair,
    optionally followed by a description of the cell line of the example.

    Parameters
    ----------
    dataset : object exposing ``data.x_drugs`` and, depending on the option,
        ``data.ddi_edge_classes`` and ``data.cell_line_features`` (torch tensors).
    indices : index tensor selecting the examples inside ``data.ddi_edge_classes``.
        Not read when ``cell_line_option == "no"``.
    edge_idx : 2-row index tensor; row 0 and row 1 index ``data.x_drugs`` for the
        first and second drug of every example.
    cell_line_option : one of
        ``"no"``       - drug features only,
        ``"index"``    - append a one-hot encoding of the cell-line class,
        ``"features"`` - append the rows of ``data.cell_line_features`` selected
                         by the cell-line class.

    Returns
    -------
    numpy.ndarray with one row per example.

    Raises
    ------
    AssertionError if ``cell_line_option`` is not one of the supported values.
    """
    assert cell_line_option in ["no", "index", "features"]

    drug_features = dataset.data.x_drugs
    blocks = [drug_features[edge_idx[0]], drug_features[edge_idx[1]]]

    if cell_line_option == "index":
        all_classes = dataset.data.ddi_edge_classes
        # Encode only the requested examples instead of the whole dataset. The width
        # is still derived from every class present in the dataset, so the columns
        # match what encoding the full tensor and slicing afterwards would give.
        num_classes = int(all_classes.max()) + 1
        blocks.append(torch.nn.functional.one_hot(all_classes[indices], num_classes=num_classes))
    elif cell_line_option == "features":
        cell_line_ids = dataset.data.ddi_edge_classes[indices]
        blocks.append(dataset.data.cell_line_features[cell_line_ids])

    return torch.cat(blocks, dim=1).numpy()

In [10]:
def evaluate_model(model, train_dataset, test_dataset, test_bliss, config, cell_line_option="no",
                   seeds=(2, 3, 4)):
    """Refit ``model`` on several random train/validation splits and print summary metrics.

    For every seed the torch and numpy RNGs are re-seeded, then
    ``train_dataset.random_split(config)`` provides the train and validation
    indices (its third return value is ignored). Features are built with
    ``get_set``, the model is fitted on the train part and scored on the
    validation part and on the fixed test set.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
        in place once per seed.
    train_dataset : dataset exposing ``random_split(config)``,
        ``data.ddi_edge_idx`` and ``data.ddi_edge_bliss_max``.
    test_dataset : despite its name, the already-built test feature matrix; it
        is handed directly to ``model.predict``.
    test_bliss : test targets, aligned with the rows of ``test_dataset``.
    config : split configuration forwarded to ``random_split``. The target is
        always read from ``ddi_edge_bliss_max``; ``config["target"]`` is not
        consulted here.
    cell_line_option : forwarded to ``get_set`` ("no", "index" or "features").
    seeds : iterable of integer seeds, one run per seed. Defaults to the seeds
        this function previously hard-coded.

    Prints
    ------
    Mean and standard deviation over the seeds of
      * "R2": the squared Pearson correlation given by ``stats.linregress``
        (this is not the coefficient of determination), and
      * "Spear": the Spearman rank correlation,
    for the validation and the test sets. Returns ``None``.

    Raises
    ------
    ValueError if ``seeds`` is empty.
    """
    seeds = list(seeds)
    if len(seeds) == 0:
        raise ValueError("seeds must contain at least one seed")

    all_test_r2 = []
    all_test_spear = []
    all_val_r2 = []
    all_val_spear = []

    for seed in seeds:
        # Both libraries are seeded so the split and any randomness in the
        # model's use of the global numpy RNG are repeatable.
        torch.manual_seed(seed)
        np.random.seed(seed)

        train_idxs, val_idxs, _ = train_dataset.random_split(config)
        train_edge_idx = train_dataset.data.ddi_edge_idx[:, train_idxs]
        val_edge_idx = train_dataset.data.ddi_edge_idx[:, val_idxs]

        train_bliss = train_dataset.data.ddi_edge_bliss_max[train_idxs].numpy()
        val_bliss = train_dataset.data.ddi_edge_bliss_max[val_idxs].numpy()

        train_set = get_set(train_dataset, train_idxs, train_edge_idx, cell_line_option)
        val_set = get_set(train_dataset, val_idxs, val_edge_idx, cell_line_option)

        model.fit(train_set, train_bliss)

        val_preds = model.predict(val_set)
        all_val_r2.append(stats.linregress(val_preds, val_bliss).rvalue**2)
        all_val_spear.append(spearmanr(val_bliss, val_preds).correlation)

        # The test features are the same for every seed; only the fitted model changes.
        test_preds = model.predict(test_dataset)
        all_test_r2.append(stats.linregress(test_preds, test_bliss).rvalue**2)
        all_test_spear.append(spearmanr(test_bliss, test_preds).correlation)

    print("val R2", np.mean(all_val_r2), np.std(all_val_r2))
    print("val Spear", np.mean(all_val_spear), np.std(all_val_spear))

    print("test R2", np.mean(all_test_r2), np.std(all_test_r2))
    print("test Spear", np.mean(all_test_spear), np.std(all_test_spear))

## Pair Level Split

In [4]:
# Pair-level experiment: ALMANAC study restricted to the MCF7 cell line,
# drugs described with fingerprints of 1024 bits and radius 2.
dataset = DrugCombMatrix(study_name='ALMANAC', cell_line='MCF7', fp_bits=1024, fp_radius=2)

Dataset loaded.
4463 drug comb experiments among 149 drugs
	 fingeprints with radius 2 and nbits 1024
	 drug features dimension 1173
	 1 cell-lines


In [5]:
# Set seed
seed = 1
torch.manual_seed(seed)
np.random.seed(seed)

# Split
config = {"val_set_prop": 0.2,
          "test_set_prop": 0.1,
          "split_valid_train": "pair_level",
          "test_on_unseen_cell_line": False,
          "target": "bliss_max"}

# The test set does not depend on the seed
# NOTE(review): evaluate_model re-splits this dataset under other seeds and discards
# the test indices it gets back, so the statement above must hold for random_split,
# otherwise test examples can end up in the training folds - confirm.
_, _, test_idxs = dataset.random_split(config)
test_edge_idx = dataset.data.ddi_edge_idx[:, test_idxs]
test_bliss = dataset.data.ddi_edge_bliss_max[test_idxs].numpy()

# Use the shared helper so that test features are built exactly like the
# train/validation features inside evaluate_model.
test_set = get_set(dataset, test_idxs, test_edge_idx, cell_line_option="no")

In [6]:
# Linear support-vector regression baseline (drug features only).
comb_svm = LinearSVR(
    C=0.1,
    tol=0.001,
    max_iter=1000000,
    random_state=0,
)
evaluate_model(comb_svm, dataset, test_set, test_bliss, config)

val R2 0.2370352295509844 0.004864897661955212
val Spear 0.4932619593352429 0.01432980534441483
test R2 0.17143226478348375 0.006290679844989228
test Spear 0.4527530524666828 0.004408367459766309


In [7]:
# Gradient-boosted trees baseline (drug features only).
# The loss is left at its default, which is least squares in every scikit-learn
# release: the explicit spelling 'ls' was deprecated in 1.0 and removed in 1.2,
# while its replacement 'squared_error' is not accepted by older releases.
boosting_tree = GradientBoostingRegressor(n_estimators=100,
          max_depth=20,
          min_samples_split=20,
          learning_rate=0.1,
          max_features='sqrt')
evaluate_model(boosting_tree, dataset, test_set, test_bliss, config)

val R2 0.2761292458780706 0.02186719187641223
val Spear 0.4843307636504904 0.0333374209706798
test R2 0.17240154350005946 0.002411191048605907
test Spear 0.45891524869772354 0.0013972159101678335


## Drug Level Split

In [8]:
# Drug-level split: train and test examples come from two dedicated dataset
# classes configured with the same study, cell line and fingerprint settings.
dataset_train = DrugCombMatrixDrugLevelSplitTrain(study_name='ALMANAC', cell_line='MCF7',
                                                  fp_bits=1024, fp_radius=2)

dataset_test = DrugCombMatrixDrugLevelSplitTest(study_name='ALMANAC', cell_line='MCF7',
                                                fp_bits=1024, fp_radius=2)

Dataset loaded.
4463 drug comb experiments among 149 drugs
	 fingeprints with radius 2 and nbits 1024
	 drug features dimension 1173
	 1 cell-lines
2328 drug comb experiments among 149 drugs
Dataset loaded.
4463 drug comb experiments among 149 drugs
	 fingeprints with radius 2 and nbits 1024
	 drug features dimension 1173
	 1 cell-lines
326 drug comb experiments among 149 drugs


In [9]:
# Set seed
seed = 1
torch.manual_seed(seed)
np.random.seed(seed)

# Split: no test proportion is requested here because the test examples come
# from the separate dataset_test object.
config = {"val_set_prop": 0.2,
          "test_set_prop": 0.,
          "split_valid_train": "pair_level",
          "test_on_unseen_cell_line": False,
          "target": "bliss_max"}

# Every example of dataset_test is used for testing.
test_edge_idx = dataset_test.data.ddi_edge_idx
test_bliss = dataset_test.data.ddi_edge_bliss_max.numpy()

# Use the shared helper instead of re-implementing the concatenation; the index
# argument is not read for cell_line_option="no", so a plain range is passed.
test_set = get_set(dataset_test, torch.arange(test_edge_idx.shape[1]), test_edge_idx,
                   cell_line_option="no")

In [10]:
# Linear support-vector regression baseline, trained on the drug-level train split.
comb_svm = LinearSVR(
    C=0.1,
    tol=0.001,
    max_iter=1000000,
    random_state=0,
)
evaluate_model(comb_svm, dataset_train, test_set, test_bliss, config)

val R2 0.2277599554734043 0.05540620479967322
val Spear 0.45508214673704256 0.04482903657423826
test R2 0.025555430013379062 0.00773491235484139
test Spear 0.10568538198923554 0.014788801436152032


In [11]:
# Gradient-boosted trees baseline, trained on the drug-level train split.
# The loss is left at its default, which is least squares in every scikit-learn
# release: the explicit spelling 'ls' was deprecated in 1.0 and removed in 1.2,
# while its replacement 'squared_error' is not accepted by older releases.
boosting_tree = GradientBoostingRegressor(n_estimators=100,
          max_depth=20,
          min_samples_split=20,
          learning_rate=0.1,
          max_features='sqrt')
evaluate_model(boosting_tree, dataset_train, test_set, test_bliss, config)

val R2 0.2726274644443823 0.06460262039684045
val Spear 0.4303426820056275 0.022202853109618698
test R2 0.024131333158824746 0.0024849659802318224
test Spear 0.13660704165569318 0.015347045715059095


## Multi Cell Line

In [12]:
# Multi-cell-line experiment: cell_line=None instead of a single cell-line name.
dataset = DrugCombMatrix(study_name='ALMANAC', cell_line=None, fp_bits=1024, fp_radius=2)

# Seed torch and numpy before drawing the split.
seed = 1
torch.manual_seed(seed)
np.random.seed(seed)

config = {
    "val_set_prop": 0.2,
    "test_set_prop": 0.1,
    "split_valid_train": "pair_level",
    "test_on_unseen_cell_line": False,
    "target": "bliss_max",
}

# Only the test indices are kept; train/validation indices are redrawn per seed
# inside evaluate_model.
_, _, test_idxs = dataset.random_split(config)

Dataset loaded.
200793 drug comb experiments among 149 drugs
	 fingeprints with radius 2 and nbits 1024
	 drug features dimension 1173
	 46 cell-lines


In [13]:
# Restrict the test set to MCF7
mcf7_idxs = np.where(np.array(dataset.data.ddi_edge_classes) == dataset.data.cell_line_to_idx_dict['MCF7'])[0]

mcf7_test_idxs = list(set(test_idxs.numpy()).intersection(mcf7_idxs))
mcf7_test_idxs = torch.Tensor(mcf7_test_idxs).long()


In [14]:
# Targets and drug-pair indices of the MCF7 test examples.
test_bliss = dataset.data.ddi_edge_bliss_max[mcf7_test_idxs].numpy()
test_edge_idx = dataset.data.ddi_edge_idx[:, mcf7_test_idxs]

# Drug features followed by a one-hot encoding of the cell-line class.
test_set = get_set(
    dataset,
    mcf7_test_idxs,
    test_edge_idx,
    cell_line_option="index",
)

In [15]:
# Linear support-vector regression baseline with one-hot cell-line inputs.
comb_svm = LinearSVR(
    C=0.1,
    tol=0.001,
    max_iter=1000000,
    random_state=0,
)
evaluate_model(comb_svm, dataset, test_set, test_bliss, config, cell_line_option="index")

val R2 0.15687447102446284 0.05120361637727491
val Spear 0.3712164342800531 0.011225365452072983
test R2 0.15135112725670463 0.0028435251519886356
test Spear 0.308100835071679 0.002555561622346465


In [16]:
# Gradient-boosted trees baseline with one-hot cell-line inputs.
# The loss is left at its default, which is least squares in every scikit-learn
# release: the explicit spelling 'ls' was deprecated in 1.0 and removed in 1.2,
# while its replacement 'squared_error' is not accepted by older releases.
boosting_tree = GradientBoostingRegressor(n_estimators=100,
          max_depth=20,
          min_samples_split=20,
          learning_rate=0.1,
          max_features='sqrt')
evaluate_model(boosting_tree, dataset, test_set, test_bliss, config, cell_line_option="index")

val R2 0.2937267014580645 0.05625799071479229
val Spear 0.47437323994331154 0.010332947860721316
test R2 0.21188338021130979 0.008242252557436939
test Spear 0.36990030536336316 0.022811765086414196


## Cell Line Transfer

In [17]:
# Cell-line transfer experiment: reload the study with cell_line=None so this
# section does not rely on the dataset object of the previous section.
dataset = DrugCombMatrix(study_name='ALMANAC', cell_line=None, fp_bits=1024, fp_radius=2)

# Seed torch and numpy before drawing the split.
seed = 1
torch.manual_seed(seed)
np.random.seed(seed)

# The split is asked to test on an unseen cell line, with MCF7 listed as the
# cell line placed in the test set.
config = {
    "val_set_prop": 0.2,
    "test_set_prop": 0.1,
    "split_valid_train": "pair_level",
    "test_on_unseen_cell_line": True,
    "cell_line": None,
    "cell_lines_in_test": ['MCF7'],
    "target": "bliss_max",
}

_, _, test_idxs = dataset.random_split(config)

Dataset loaded.
200793 drug comb experiments among 149 drugs
	 fingeprints with radius 2 and nbits 1024
	 drug features dimension 1173
	 46 cell-lines


In [18]:
# Targets and drug-pair indices of the held-out test examples.
test_bliss = dataset.data.ddi_edge_bliss_max[test_idxs].numpy()
test_edge_idx = dataset.data.ddi_edge_idx[:, test_idxs]

# Drug features followed by the feature vector of each example's cell line.
test_set = get_set(
    dataset,
    test_idxs,
    test_edge_idx,
    cell_line_option="features",
)

In [19]:
# Linear support-vector regression baseline with cell-line feature inputs.
comb_svm = LinearSVR(
    C=0.1,
    tol=0.001,
    max_iter=1000000,
    random_state=0,
)
evaluate_model(comb_svm, dataset, test_set, test_bliss, config, cell_line_option="features")

val R2 0.14213931806598878 0.02021737627794062
val Spear 0.3725804722605914 0.007136353635589427
test R2 0.1885035935984798 0.0033713047513444384
test Spear 0.34449785924288107 0.0021887385140420125


In [20]:
# Gradient-boosted trees baseline with cell-line feature inputs.
# The loss is left at its default, which is least squares in every scikit-learn
# release: the explicit spelling 'ls' was deprecated in 1.0 and removed in 1.2,
# while its replacement 'squared_error' is not accepted by older releases.
boosting_tree = GradientBoostingRegressor(n_estimators=100,
          max_depth=20,
          min_samples_split=20,
          learning_rate=0.1,
          max_features='sqrt')
evaluate_model(boosting_tree, dataset, test_set, test_bliss, config, cell_line_option="features")

val R2 0.30983919988222214 0.013704751166545315
val Spear 0.5071294111511242 0.007686993828352864
test R2 0.38317577942456876 0.016551522027221773
test Spear 0.40563156767350317 0.015099429620042964


## Study Transfer

In [21]:
# Study transfer experiment: train on the ONEIL study, test on ALMANAC.
dataset_train = DrugCombMatrixTrainOneil(study_name='ONEIL', cell_line=None,
                                         fp_bits=1024, fp_radius=2)

dataset_test = DrugCombMatrixTestAlmanac(study_name='ALMANAC', cell_line=None,
                                         fp_bits=1024, fp_radius=2)

# Seed torch and numpy.
seed = 1
torch.manual_seed(seed)
np.random.seed(seed)

# No test proportion is requested: the test examples come from dataset_test.
config = {
    "val_set_prop": 0.2,
    "test_set_prop": 0.,
    "split_valid_train": "pair_level",
    "test_on_unseen_cell_line": False,
    "cell_line": None,
    "target": "bliss_max",
}

# Every example of dataset_test is used for testing.
test_bliss = dataset_test.data.ddi_edge_bliss_max.numpy()
test_edge_idx = dataset_test.data.ddi_edge_idx

# Dummy test indices
test_idxs = torch.arange(dataset_test.data.ddi_edge_idx.shape[1])

test_set = get_set(
    dataset_test,
    test_idxs,
    test_edge_idx,
    cell_line_option="features",
)

Dataset loaded.
16785 drug comb experiments among 129 drugs
	 fingeprints with radius 2 and nbits 1024
	 drug features dimension 1153
	 9 cell-lines
keep only fingerprint features
Dataset loaded.
302 drug comb experiments among 111 drugs
	 fingeprints with radius 2 and nbits 1024
	 drug features dimension 1135
	 9 cell-lines
keep only fingerprint features


In [22]:
# Linear support-vector regression baseline, trained on ONEIL with cell-line features.
comb_svm = LinearSVR(
    C=0.1,
    tol=0.001,
    max_iter=1000000,
    random_state=0,
)
evaluate_model(comb_svm, dataset_train, test_set, test_bliss, config, cell_line_option="features")

val R2 0.15306851234181687 0.012862172307966782
val Spear 0.4208073444887846 0.009056934300789557
test R2 0.005022809695337565 0.0009581576629098544
test Spear 0.10409909151163156 0.004002457198152873


In [23]:
# Gradient-boosted trees baseline, trained on ONEIL with cell-line features.
# The loss is left at its default, which is least squares in every scikit-learn
# release: the explicit spelling 'ls' was deprecated in 1.0 and removed in 1.2,
# while its replacement 'squared_error' is not accepted by older releases.
boosting_tree = GradientBoostingRegressor(n_estimators=100,
          max_depth=20,
          min_samples_split=20,
          learning_rate=0.1,
          max_features='sqrt')
evaluate_model(boosting_tree, dataset_train, test_set, test_bliss, config, cell_line_option="features")

val R2 0.31098213131066216 0.005932088544277264
val Spear 0.5797525329527266 0.010465217950405982
test R2 0.000519547238353456 0.0006411019457136357
test Spear 0.04364820712863555 0.000945152142126725


## One Hidden Drug

In [11]:
# One-hidden-drug split: train and test examples come from two dedicated dataset
# classes configured with the same study, cell line and fingerprint settings.
dataset_train = DrugCombMatrixOneHiddenDrugSplitTrain(study_name='ALMANAC', cell_line='MCF7',
                                                      fp_bits=1024, fp_radius=2)

dataset_test = DrugCombMatrixOneHiddenDrugSplitTest(study_name='ALMANAC', cell_line='MCF7',
                                                    fp_bits=1024, fp_radius=2)

Dataset loaded.
4463 drug comb experiments among 149 drugs
	 fingeprints with radius 2 and nbits 1024
	 drug features dimension 1173
	 1 cell-lines
3930 drug comb experiments among 149 drugs
Dataset loaded.
4463 drug comb experiments among 149 drugs
	 fingeprints with radius 2 and nbits 1024
	 drug features dimension 1173
	 1 cell-lines
1981 drug comb experiments among 149 drugs


In [12]:
# Set seed
seed = 1
torch.manual_seed(seed)
np.random.seed(seed)

# Split: no test proportion is requested here because the test examples come
# from the separate dataset_test object.
config = {"val_set_prop": 0.2,
          "test_set_prop": 0.,
          "split_valid_train": "pair_level",
          "test_on_unseen_cell_line": False,
          "target": "bliss_max"}

# Every example of dataset_test is used for testing.
test_edge_idx = dataset_test.data.ddi_edge_idx
test_bliss = dataset_test.data.ddi_edge_bliss_max.numpy()

# Use the shared helper instead of re-implementing the concatenation; the index
# argument is not read for cell_line_option="no", so a plain range is passed.
test_set = get_set(dataset_test, torch.arange(test_edge_idx.shape[1]), test_edge_idx,
                   cell_line_option="no")

In [13]:
# Linear support-vector regression baseline, trained on the one-hidden-drug train split.
comb_svm = LinearSVR(
    C=0.1,
    tol=0.001,
    max_iter=1000000,
    random_state=0,
)
evaluate_model(comb_svm, dataset_train, test_set, test_bliss, config)

val R2 0.22144186449256154 0.040409993857638694
val Spear 0.4398062851576692 0.02418015182710733
test R2 0.1669889215856694 0.005142345821065832
test Spear 0.33320473422337327 0.0016895052056015946


In [14]:
# Gradient-boosted trees baseline, trained on the one-hidden-drug train split.
# The loss is left at its default, which is least squares in every scikit-learn
# release: the explicit spelling 'ls' was deprecated in 1.0 and removed in 1.2,
# while its replacement 'squared_error' is not accepted by older releases.
boosting_tree = GradientBoostingRegressor(n_estimators=100,
          max_depth=20,
          min_samples_split=20,
          learning_rate=0.1,
          max_features='sqrt')
evaluate_model(boosting_tree, dataset_train, test_set, test_bliss, config)

val R2 0.31972088159892426 0.08359629814948578
val Spear 0.47500654202940473 0.041825293802971515
test R2 0.18385353107878677 0.010574875021192177
test Spear 0.36152551120849435 0.0016098157523196878
