### Test functions for DRP_nb module

In [27]:
import os
import torch
import numpy as np
import pandas as pd
from importlib import reload
import torch_geometric.data as tgd

In [2]:
from DRP_nb import data_imports, feature_selection, utils, splitting

In [38]:
# Re-import the DRP_nb submodules so that edits made to their source files
# are picked up without restarting the kernel. The value of the last call is
# echoed as the cell output.
reload(data_imports)
reload(feature_selection)
reload(utils)
reload(splitting)

<module 'DRP_nb.splitting' from '/data/home/wpw035/Codebase/DRP_nb/splitting.py'>

## Data imports

In [23]:
# Build the input-data container. Only the 'phos' omic type is requested here,
# even though the variable name refers to phos/prot/rna (ppr); drugs use the
# 'mol_graph' representation.
inp_ppr = data_imports.DrpInputData(omic_types=['phos'], drug_rep='mol_graph')
# Take out disjoint cls (presumably cell lines).
# NOTE(review): the return value is discarded, so this presumably mutates
# inp_ppr in place -- confirm against DrpInputData.remove_disjoint.
inp_ppr.remove_disjoint()
# Echo the object so its repr is shown as the cell output.
inp_ppr

DrpInputData, ['phos'] omics, mol_graph drug representation

## Feature selection and create data for all drugs
Here using landmark targets that are also landmarks (ltl).

In [24]:
# Select the 'ltl' feature set from the phos column labels.
# NOTE(review): `ltl` is not used by any later cell visible in this notebook;
# x_all_phos below is built from the full inp_ppr.phos frame. Confirm whether
# the selected features were meant to be applied here.
ltl = feature_selection.ltl(inp_ppr.phos.columns)
# Build the omics inputs, drug inputs and targets for all drugs. Later cells
# index all three results by '<cl>::<drug>' pair labels.
x_all_phos, x_drug, y_list = utils.create_all_drugs(
    inp_ppr.phos, inp_ppr.marker_drugs, inp_ppr.y_df)

# Row labels of the phos frame and the drug collection; both are handed to
# splitting.split in the splitting cell.
_all_cls = inp_ppr.phos.index
_all_drugs = inp_ppr.all_drugs

## Data splitting and putting data in dataloaders

In [25]:
# --- Split configuration ---------------------------------------------------
# Pair labels that have a target value.
pairs_with_truth_vals = y_list.index
# NOTE(review): `batch_size` is not referenced by any cell visible in this
# notebook -- confirm whether it should be passed to utils.into_dls.
batch_size, train_size, rand_seed = 512, 0.8, 42

# --- First pass: train vs. held-out pool ('cblind' split type) -------------
# splitting.split prints the resulting set fractions (see the cell output).
train_pairs, test_pairs = splitting.split(
    rand_seed,
    _all_cls,
    _all_drugs,
    pairs_with_truth_vals,
    train_size=train_size,
    split_type='cblind',
)

# --- Second pass: divide the held-out pool into validation and test --------
# The part of each pair label before '::' is collected (uniquely) and used as
# the population for the second 'cblind' split. `test_pairs` is deliberately
# re-bound to the remaining half.
test_cls = np.unique([label.split('::')[0] for label in test_pairs])
val_pairs, test_pairs = splitting.split(
    rand_seed,
    pd.Index(test_cls),
    _all_drugs,
    test_pairs,
    train_size=0.5,
    split_type='cblind',
)

# --- Slice inputs and targets by split --------------------------------------
# Omics inputs, selected by pair label.
xo_train_phos, xo_val_phos, xo_test_phos = (
    x_all_phos.loc[subset] for subset in (train_pairs, val_pairs, test_pairs))

# Drug inputs, selected by pair label.
xd_train, xd_val, xd_test = (
    x_drug.loc[subset] for subset in (train_pairs, val_pairs, test_pairs))

# Targets, selected by pair label.
y_train, y_val, y_test = (
    y_list[subset] for subset in (train_pairs, val_pairs, test_pairs))

Fraction of cls in sets, relative to all clsbefore mising values are removed
train fraction 0.7857142857142857, test fraction 0.21428571428571427
------
Fraction of cls in sets, relative to all cl drug pairs, after mising values are removed
train fraction 0.6915622389306599, test fraction 0.20192147034252297
Fraction of cls in sets, relative to all clsbefore mising values are removed
train fraction 0.4444444444444444, test fraction 0.5555555555555556
------
Fraction of cls in sets, relative to all cl drug pairs, after mising values are removed
train fraction 0.40896686159844053, test fraction 0.5333333333333333


In [39]:
# Wrap the omics / drug / target data of each split with utils.into_dls.
# Omics inputs and targets get an extra axis inserted at position 1 first.
# NOTE(review): the training call does not pass batch_size, so the
# `batch_size` variable from the splitting cell is not applied here -- confirm.
train_dls = utils.into_dls([
    np.expand_dims(xo_train_phos, axis=1),
    xd_train,
    np.expand_dims(y_train, axis=1),
])

# Test set: batch size equals the number of test targets (one full batch).
test_dls = utils.into_dls(
    [
        np.expand_dims(xo_test_phos, axis=1),
        xd_test,
        np.expand_dims(y_test, axis=1),
    ],
    batch_size=len(y_test),
)

# Validation set: batch size equals the number of validation targets.
val_dls = utils.into_dls(
    [
        np.expand_dims(xo_val_phos, axis=1),
        xd_val,
        np.expand_dims(y_val, axis=1),
    ],
    batch_size=len(y_val),
)

In [16]:
from torch_geometric.data import batch 

In [35]:
# Exploratory check: echo the module object to see which torch_geometric.data
# installation is loaded.
tgd

<module 'torch_geometric.data' from '/data/home/wpw035/.conda/envs/pytorch1/lib/python3.10/site-packages/torch_geometric/data/__init__.py'>

In [17]:
# Exploratory check: instantiate an empty Batch to see what object it produces.
tgd.Batch()

DataBatch()

In [28]:
# Map every '<cl>::<drug>' pair that has a target value to its own copy of the
# drug's graph, with the pair's target attached as `graph.y`. One graph is
# stored per pair, so the same drug graph is repeated many times.
pairs_to_graphs = {}
for pair in pairs_with_truth_vals:
    d = pair.split('::')[1]  # drug identifier: the part after '::'
    y = y_list.loc[pair].astype(np.float32)
    y = np.expand_dims(y, -1)  # append a trailing axis to the target value
    # Work on a clone, presumably so that setting `.y` does not modify the
    # graph stored in inp_ppr.dtg.
    graph = tgd.Data.clone(inp_ppr.dtg[d])
    graph.y = torch.tensor(y)
    pairs_to_graphs[pair] = graph

# Collate the training pairs' graphs into a single torch_geometric batch object.
# NOTE(review): from_data_list is invoked on a Batch() *instance* rather than on
# the Batch class; a later cell shows the resulting type is `DataDataBatch`.
# Confirm what utils.into_dls expects before switching to
# tgd.Batch.from_data_list(...).
train_graphs = tgd.Batch().from_data_list(
    [pairs_to_graphs[pair] for pair in train_pairs])
# The test / validation batches are currently disabled:
#test_graphs = tgd.Batch().from_data_list(
    #[pairs_to_graphs[pair] for pair in test_pairs])
#val_graphs = tgd.Batch().from_data_list(
##[pairs_to_graphs[pair] for pair in val_pairs])

In [40]:
# Training loaders with the batched drug graphs (train_graphs) as the drug
# input; omics inputs and targets get an extra axis inserted at position 1.
train_graph_dls = utils.into_dls(
    [
        np.expand_dims(xo_train_phos, axis=1),
        train_graphs,
        np.expand_dims(y_train, axis=1),
    ]
)

In [41]:
# Echo the list of loaders to see which loader class was created for each input.
train_graph_dls

[<torch.utils.data.dataloader.DataLoader at 0x2ab80a8b1ff0>,
 <torch_geometric.loader.dataloader.DataLoader at 0x2ab80a8b0d90>,
 <torch.utils.data.dataloader.DataLoader at 0x2ab80a8b0af0>]

In [36]:
# Exploratory check: inspect the concrete class of the batched training graphs.
type(train_graphs)

torch_geometric.data.batch.DataDataBatch

In [37]:
# Exploratory check: compare that class against tgd.batch.DataDataBatch.
type(train_graphs) == tgd.batch.DataDataBatch

True

In [7]:

# Disabled legacy code kept as a bare string literal: nothing inside it is
# executed. Because the string is the cell's last expression, its repr is
# echoed as the cell output.
# NOTE(review): the text references `drugs_to_graphs`, which is not defined in
# any cell visible in this notebook -- confirm before re-enabling.
'''
#get one-hot enconidg of smiles for test train set.
def create_smiles_hot(drugs, drug_to_econding_dict=inp_ppr.dths):
    #Ceate x data for drugs using smiles one-hot enconding


    x_drug_final = []
    for drug in drugs:
        x_drug_final.append(drug_to_econding_dict[drug])

    x_drug_final = np.dstack(x_drug_final)
    x_drug_final = np.rollaxis(x_drug_final, -1)

    return x_drug_final

xd_smile_train = create_smiles_hot(
    [cl_d.split('::')[1] for cl_d in train_pairs]).astype('float32')
xd_smile_test = create_smiles_hot(
    [cl_d.split('::')[1] for cl_d in test_pairs]).astype('float32')
xd_smile_val = create_smiles_hot(
    [cl_d.split('::')[1] for cl_d in val_pairs]).astype('float32')

xd_smile_train = np.swapaxes(xd_smile_train, 1, 2)
xd_smile_test = np.swapaxes(xd_smile_test, 1, 2)
xd_smile_val = np.swapaxes(xd_smile_val, 1, 2)


#dict that maps drug cl pair to graph rep (has lots of repeats)
pairs_to_graphs = {}
for pair in pairs_with_truth_vals:
    d = pair.split('::')[1]
    y = y_list.loc[pair].astype(np.float32)
    y = np.expand_dims(y, -1)
    graph = tgd.Data.clone(drugs_to_graphs[d])
    graph.y = torch.tensor(y)
    pairs_to_graphs[pair] = graph

#map train and testing pairs to graphs in torch geo list objects 
train_graphs = tgd.Batch().from_data_list(
    [pairs_to_graphs[pair] for pair in train_pairs])
#test_graphs = tgd.Batch().from_data_list(
    #[pairs_to_graphs[pair] for pair in test_pairs])
#val_graphs = tgd.Batch().from_data_list(
##[pairs_to_graphs[pair] for pair in val_pairs])
'''

"\n#get one-hot enconidg of smiles for test train set.\ndef create_smiles_hot(drugs, drug_to_econding_dict=inp_ppr.dths):\n    #Ceate x data for drugs using smiles one-hot enconding\n\n\n    x_drug_final = []\n    for drug in drugs:\n        x_drug_final.append(drug_to_econding_dict[drug])\n\n    x_drug_final = np.dstack(x_drug_final)\n    x_drug_final = np.rollaxis(x_drug_final, -1)\n\n    return x_drug_final\n\nxd_smile_train = create_smiles_hot(\n    [cl_d.split('::')[1] for cl_d in train_pairs]).astype('float32')\nxd_smile_test = create_smiles_hot(\n    [cl_d.split('::')[1] for cl_d in test_pairs]).astype('float32')\nxd_smile_val = create_smiles_hot(\n    [cl_d.split('::')[1] for cl_d in val_pairs]).astype('float32')\n\nxd_smile_train = np.swapaxes(xd_smile_train, 1, 2)\nxd_smile_test = np.swapaxes(xd_smile_test, 1, 2)\nxd_smile_val = np.swapaxes(xd_smile_val, 1, 2)\n\n\n#dict that maps drug cl pair to graph rep (has lots of repeats)\npairs_to_graphs = {}\nfor pair in pairs_with_tr