In [1]:
import torch as tc 
import pandas as pd
import numpy as np
from NN import Interaction_Model
from data import MyDataSet
from sklearn.metrics import auc
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn as nn
from tqdm import tqdm
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler




In [2]:
# Fold index; reused later in this notebook to pick the model checkpoint file
# and to tag / name the exported prediction table.
fold=3
# NOTE(review): nsplits presumably sets the number of cross-validation splits
# inside MyDataSet — confirm against data.py.
ds = MyDataSet(nsplits = 5)
# NOTE(review): presumably positions the dataset on fold `fold` in 'train' mode;
# the semantics of change_fold are not visible here — confirm against data.py.
ds.change_fold(fold, 'train')

19193 molecular features, 476 celltypes, 2659 drug_features
1135drugs and 476cell lines


In [3]:
# Display the shape of ds.cell_line; these identifiers are used further down
# to exclude cell lines from the Sanger table.
ds.cell_line.shape

(476,)

In [4]:
# Load the Sanger GDSC2 AUC table and reshape it from wide to long format:
# the first (unnamed) CSV column becomes 'cell_line', every other column header
# becomes a value of 'drug', and the cell values become 'auc'.
# Rows containing missing values are dropped.
sanger_results = (
    pd.read_csv('./data/Drug_sensitivity_AUC_(Sanger_GDSC2).csv')
    .melt(id_vars='Unnamed: 0', var_name='drug', value_name='auc')
    .rename(columns={'Unnamed: 0': 'cell_line'})
    .dropna()
)

# Keep only the part of each drug label that precedes the first " (".
# The pattern is a regular expression, so it is written as a raw string:
# in a normal string literal '\(' is an invalid escape sequence and
# triggers a warning on recent Python versions.
sanger_results['drug'] = sanger_results['drug'].str.split(r' \(').str[0]

sanger_results

Unnamed: 0,cell_line,drug,auc
0,ACH-000001,CAMPTOTHECIN,0.967187
1,ACH-000002,CAMPTOTHECIN,0.711866
2,ACH-000004,CAMPTOTHECIN,0.742350
3,ACH-000006,CAMPTOTHECIN,0.752250
4,ACH-000007,CAMPTOTHECIN,0.845466
...,...,...,...
137958,ACH-000930,JQ1-(+),0.830671
137960,ACH-000934,JQ1-(+),0.736601
138016,ACH-001065,JQ1-(+),0.585800
138152,ACH-002163,JQ1-(+),0.878883


In [5]:
# Step 1: keep measurements whose drug name is a column of ds.drug_embeddings.
sanger_results_filtered = sanger_results.loc[
    sanger_results['drug'].isin(ds.drug_embeddings.columns)
]
# Step 2: drop every measurement whose cell line is listed in ds.cell_line,
# so only cell lines absent from ds.cell_line remain.
sanger_results_filtered = sanger_results_filtered.loc[
    ~sanger_results_filtered['cell_line'].isin(ds.cell_line)
]
sanger_results_filtered

In [6]:
# Load the expression table; the first CSV column is used as the row index.
rna_data = pd.read_csv('./data/OmicsExpressionProteinCodingGenesTPMLogp1.csv', index_col=0)

# Keep only the part of each column header that precedes the first " (".
# Raw string: the pattern is a regular expression, and '\(' inside a normal
# string literal is an invalid escape sequence that triggers a warning on
# recent Python versions.
rna_data.columns = rna_data.columns.str.split(r' \(').str[0]

# Distinct cell-line identifiers remaining in the filtered Sanger table.
sanger_cell_lines = np.array(sanger_results_filtered['cell_line'].drop_duplicates())
rna_data

In [7]:
# Expression rows for the cell lines present in the filtered Sanger table.
# The inner join drops cell lines without an expression row; the result is
# indexed by 'cell_line'.
rna_data_filtered = (
    sanger_results_filtered[['cell_line']]
    .drop_duplicates()
    .merge(rna_data, left_on='cell_line', right_index=True, how='inner')
    .set_index('cell_line')
)

# Restrict (and order) the columns to the names exposed by the dataset object.
names = ds.molecular_names
selected_RNA = rna_data_filtered.loc[:, np.asarray(names)]

# The CSV export happens before scaling, so the file holds the unscaled values.
selected_RNA.to_csv('./results/data/sanger_RNA.csv')

# Scale with the statistics stored on ds.scaler rather than statistics of this
# table. NOTE(review): ds.scaler is presumably a fitted sklearn StandardScaler
# — confirm in data.py.
meanv = ds.scaler.mean_
stdv = np.sqrt(ds.scaler.var_)
# 0.1 is added to the standard deviation in the denominator.
# NOTE(review): presumably this mirrors the preprocessing used for training
# and guards against near-zero variance — confirm.
selected_RNA = selected_RNA.sub(meanv, axis='columns').div(stdv + 0.1, axis='columns')

selected_RNA



In [8]:
# Keep only measurements whose cell line has a row in selected_RNA.
# .copy() makes the result an independent frame, so the column assignment
# below does not raise SettingWithCopyWarning on a boolean-filtered slice.
sanger_results_filtered = sanger_results_filtered.loc[
    np.isin(sanger_results_filtered['cell_line'], selected_RNA.index)
].copy()

# Z-score the AUC within each drug: subtract the drug's mean AUC and divide by
# the drug's sample standard deviation (pandas default, ddof=1).
# A drug with a single measurement therefore gets NaN.
sanger_results_filtered['auc_per_drug'] = (
    sanger_results_filtered
    .groupby('drug')['auc']
    .transform(lambda x: (x - x.mean()) / x.std())
)



In [9]:
# Write the current sanger_results_filtered frame to CSV (the row index is
# written as the first column, pandas' default), then display the frame.
sanger_results_filtered.to_csv('./results/data/sanger_results.csv')
sanger_results_filtered

In [10]:
# Distinct drug names from the filtered table, in order of first appearance,
# as a NumPy array. The same array later defines both the order of the stacked
# drug vectors and the column labels of the prediction frame.
drugs = sanger_results_filtered['drug'].drop_duplicates().to_numpy()
drugs

In [11]:
def load_model(fold):
    """Build an ``Interaction_Model`` and load the saved parameters for ``fold``.

    Parameters
    ----------
    fold : int or str
        Fold identifier. It is converted with ``str()`` and inserted into the
        checkpoint path ``./results/model_params/model_params_fold<fold>.pt``.

    Returns
    -------
    Interaction_Model
        A model constructed from the notebook-level ``ds`` object, with the
        checkpoint's state dict loaded, switched to evaluation mode.

    Notes
    -----
    Relies on the global ``ds`` defined earlier in the notebook.
    """
    checkpoint_path = './results/model_params/model_params_fold' + str(fold) + '.pt'
    model = Interaction_Model(ds)
    # map_location='cpu' lets a checkpoint that was saved from CUDA tensors be
    # read on a machine without a GPU; load_state_dict copies the values into
    # the model's own parameters, so the model's device is unaffected.
    state_dict = tc.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state_dict)
    return model.eval()


# Model for the fold selected at the top of the notebook.
model = load_model(fold)

In [12]:
# Scaled expression table as a float32 tensor; rows follow selected_RNA's row order.
sanger_RNA_tensor = tc.tensor(selected_RNA.to_numpy(), dtype=tc.float32)
# One vector per entry of `drugs`, stacked along a new leading dimension so
# that row i corresponds to drugs[i].
drug_tensor = tc.stack(
    [ds.get_drug_vector(drug_name) for drug_name in drugs],
    dim=0,
)

In [13]:
# Pure inference: disable autograd recording so no computation graph is built
# or kept alive for these forward passes. The resulting tensors hold the same
# values and simply do not require grad.
with tc.no_grad():
    # model.nn2 is applied to the expression tensor, model.nn1 to the drug tensor.
    # NOTE(review): .forward() is called directly, which bypasses any registered
    # module hooks; kept as in the original — confirm whether model.nn2(...) /
    # model.nn1(...) is preferred.
    latent_tumor = model.nn2.forward(sanger_RNA_tensor)
    latent_drug = model.nn1.forward(drug_tensor)

In [14]:
# Score every (cell line, drug) pair as the matrix product of the cell-line
# encodings with the transposed drug encodings.
res = tc.matmul(latent_tumor, latent_drug.t())

# Wide table: one row per selected_RNA row label, one column per entry of `drugs`.
res_frame = pd.DataFrame(
    data=res.detach().numpy(),
    index=selected_RNA.index,
    columns=drugs,
)
# Expose the row labels as a regular column so they can act as the id variable.
res_frame['cell_line'] = res_frame.index

# Long table with columns 'cell_line', 'drug', 'prediction'.
res_frame_long = pd.melt(
    res_frame,
    id_vars='cell_line',
    var_name='drug',
    value_name='prediction',
)

In [15]:
# Attach the predictions to the measured values. The join keys are spelled out
# instead of relying on "all columns the two frames happen to share", so an
# extra shared column added later cannot silently change the join.
diesdas = sanger_results_filtered.merge(
    res_frame_long,
    on=['cell_line', 'drug'],
    how='inner',
)
# Record which fold's model produced these predictions.
diesdas['fold'] = fold
# One output file per fold: ./results/data/sanger_results<fold>.csv
diesdas.to_csv('./results/data/sanger_results' + str(fold) + '.csv')

In [16]:
# Display the shape of the wide prediction table; the column count includes
# the extra 'cell_line' column added to res_frame earlier.
res_frame.shape

In [17]:
##############
#
##############