In [1]:
import os
from os.path import join as oj
import pandas as pd
import numpy as np
from scipy.stats import sem
from collections import defaultdict

from utils.utils import cwd, set_up_plotting

# Run the project's plotting set-up helper and keep whatever it returns as `plt`.
# NOTE(review): presumably this is matplotlib.pyplot with project styling
# applied — confirm in utils.utils.
plt = set_up_plotting()

In [2]:
from drug_utils import load_process_DAVIS, X_drug, X_target, y, kernel

Beginning Processing...
Default binary threshold for the binding affinity scores are 30, you can adjust it by using the "threshold" parameter
Done!
Setting the target site to be the amino acid with sequence "PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGKKESSRHGGPHCNVFVEHEALQRPVASDFEPQGLSEAARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNGQGWVPSNYITPVNSLEKHSWYHGPVSRNAAEYLLSSGINGSFLVRESESSPGQRSISLRYEGRVYHYRINTASDGKLYVSSESRFNTLAELVHHHSTVADGLITTLHYPAPKRNKPTVYGVSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTMEVEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIITEFMTYGNLLDYLRECNRQEVNAVVLLYMATQISSAMEYLEKKNFIHRDLAARNCLVGENHLVKVADFGLSRLMTGDTYTAHAGAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYELLEKDYRMERPEGCPEKVYELMRACWQWNPSDRPSFAEIHQAFETMFQESSISDEVEKELGKQGVRGAVSTLLQAPELPTKTRTSRRAAEHRDTTDVPEMPHSKGQGESDPLDHEPAVSPLLPRKERGPPEGGLNEDERLLPKDKKTNLFSALIKKKKKTAPTPPKRSSSFREMDGQPERRGAGEEEGRDISNGALAFTPLDTADPAKSPKPSNGAGVPNGALRESGGSGFRSPHLWKKSSTLTSSRLATGEEEGGGSSSKRFLRSCSASCVPHGAKDTEWRSVTLPRDLQSTGRQFDSSTFGGH

In [3]:
setting = 'drug'
results_dir = 'results'

# NOTE(review): cwd(...) is a project helper; presumably it switches the working
# directory for the duration of the block — confirm in utils.utils.
with cwd(oj(results_dir, setting)):
    # os.listdir() returns entries in arbitrary order. Sorting makes
    # result_datas[i] refer to the same file on every run, so the trials picked
    # up by index further down are reproducible.
    #
    # NOTE(review): allow_pickle=True unpickles object arrays, which can execute
    # arbitrary code. Only load result files produced by this project.
    result_datas = [
        np.load(file, allow_pickle=True)
        for file in sorted(os.listdir())
        if file.endswith('.npz')
    ]

### GP regression

In [4]:
def posterior_predictive(X_train, y_train, X_test, sigma_y=1e-8, kernel_fn=None):
    """Posterior predictive mean of a zero-mean Gaussian process at ``X_test``.

    Evaluates ``K_s.T @ (K + sigma_y**2 * I)^{-1} @ y_train`` where ``K`` is the
    kernel matrix of the training inputs and ``K_s`` is the train/test
    cross-kernel matrix.

    Parameters
    ----------
    X_train : sequence
        Training inputs, in the same order as ``y_train``.
    y_train : array-like
        Training targets, one per training input.
    X_test : sequence
        Inputs at which to predict.
    sigma_y : float, optional
        Observation-noise standard deviation. Its square is added to the
        diagonal of ``K`` exactly once.
    kernel_fn : callable, optional
        ``kernel_fn(A, B)`` must return the ``(len(A), len(B))`` kernel matrix.
        Defaults to the notebook-level ``kernel``.

    Returns
    -------
    numpy.ndarray
        Predictive mean, one entry per element of ``X_test``.
    """
    if kernel_fn is None:
        kernel_fn = kernel

    y_train = np.asarray(y_train)
    K = kernel_fn(X_train, X_train) + np.square(sigma_y) * np.eye(len(X_train))
    K_s = kernel_fn(X_train, X_test)

    # Solve the linear system instead of forming an explicit inverse: it is
    # cheaper and numerically more stable than inv(K) @ y_train.
    return K_s.T @ np.linalg.solve(K, y_train)

def _targets_for(X_drug, y, drugs):
    """Return the target for every entry of ``drugs``, in the order of ``drugs``.

    If a drug occurs several times in ``X_drug`` its targets are averaged.
    Raises ``ValueError`` when a requested drug has no entry in ``X_drug``.
    """
    all_drugs = np.asarray(X_drug)
    all_targets = np.asarray(y)

    targets = []
    for drug in drugs:
        matches = all_drugs == drug
        if not np.any(matches):
            raise ValueError(f"no target found for drug {drug!r}")
        targets.append(all_targets[matches].mean(axis=0))
    return np.asarray(targets)


def get_mse(X_drug, y, X_train, X_test):
    """Mean squared error of the GP predictive mean on ``X_test``.

    Parameters
    ----------
    X_drug : sequence
        All drug identifiers of the data set, aligned with ``y``.
    y : sequence
        Target value for each entry of ``X_drug``.
    X_train : array-like
        Drugs to condition the GP on; flattened and de-duplicated.
    X_test : array-like
        Drugs to evaluate on; flattened and de-duplicated.

    Returns
    -------
    float
        Mean of the squared differences between the true targets and the
        predictive mean returned by ``posterior_predictive`` (called with an
        observation-noise standard deviation of 0.1).
    """
    # np.unique both de-duplicates and SORTS the inputs.
    X_train = np.unique(X_train)
    X_test = np.unique(X_test)

    # Look the targets up in that same sorted order so that y_train[i] belongs
    # to X_train[i] and y_test[j] to X_test[j]; collecting them in data-set
    # order would pair targets with the wrong drugs whenever X_drug is unsorted.
    y_train = _targets_for(X_drug, y, X_train)
    y_test = _targets_for(X_drug, y, X_test)

    predictive_mean = posterior_predictive(X_train, y_train, X_test, 1e-1)

    return np.mean((y_test - predictive_mean) ** 2)

In [5]:
# Peek at the first result file: `.item()` unwraps the stored object array into
# the observations mapping, and `Ts` holds the inputs later used for testing.
obs, Ts = result_datas[0]['obs'].item(), result_datas[0]['Ts']

In [6]:

# For every trial and every collaboration type, evaluate the GP test MSE once
# per index i < n and record both the individual values and their average.
mse_results = defaultdict(list)
n = 3          # entries of Ts (and of the per-index 'indiv' observations) evaluated per trial
n_trials = 5   # number of result files consumed, taken by position in result_datas
for trial_i in range(n_trials):
    trial_data = result_datas[trial_i]
    obs, Ts = trial_data['obs'].item(), trial_data['Ts']

    for collab_type, collab_obs in obs.items():
        # 'indiv' types keep one observation set per index; every other type
        # trains on the same observations for all indices.
        per_index = 'indiv' in collab_type
        mses = [
            get_mse(X_drug, y, collab_obs[i] if per_index else collab_obs, Ts[i])
            for i in range(n)
        ]
        mse = sum(mse_i / n for mse_i in mses)

        mse_results[collab_type + '-avg-mses'].append(mse)
        mse_results[collab_type + '-mses'].append(mses)

In [7]:
import pandas as pd

In [8]:
# Collapse the per-trial results into one row of summary statistics per baseline.
data_df = defaultdict(list)
for collab_type, mse_list in mse_results.items():
    # Turn the result key into a short display label.
    baseline = collab_type
    for fragment in ('-avg-mses', '-mses', '_obs'):
        baseline = baseline.replace(fragment, '')
    baseline = baseline.replace('indiv_greedy', 'ind')

    # Each baseline appears under two keys ('-avg-mses' and '-mses'); list it once.
    if baseline not in data_df['Baselines']:
        data_df['Baselines'].append(baseline)

    if '-avg-mses' not in collab_type:
        # mse_list holds one list of MSEs per trial: take the spread within
        # each trial, then summarise those spreads across trials.
        stds = np.std(mse_list, axis=1)
        data_df['Std MSE'].append(np.mean(stds))
        data_df['Stderr Std'].append(sem(stds))
    else:
        # mse_list holds one averaged MSE per trial.
        data_df['Avg MSE'].append(np.mean(mse_list))
        data_df['Stderr'].append(sem(mse_list))

In [9]:
# Assemble the summary table: every key of data_df becomes a column and every
# baseline contributes one row.
drug_regression_df = pd.DataFrame(data=data_df)

In [10]:
# Write the summary table as LaTeX and as CSV, both without the row index.
# NOTE(review): cwd(...) is a project helper; presumably it switches the working
# directory to results/<setting> for the duration of the block, so the files
# land next to the loaded .npz results — confirm in utils.utils.
with cwd(oj(results_dir, setting)):
    drug_regression_df.to_latex('regression_results.tex',index=False)
    drug_regression_df.to_csv('regression_results.csv',index=False)
