In [47]:
import os

# TF_CPP_MIN_LOG_LEVEL has to be in the environment before TensorFlow is
# imported; otherwise the messages emitted while its native library loads
# are not silenced.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import pickle
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.linalg import block_diag
from scipy.stats import multivariate_normal as mvn
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

In [48]:
# Silences every Python warning for the rest of the session (including NumPy
# overflow / deprecation warnings), so numerical problems will not be reported.
warnings.filterwarnings('ignore')

In [49]:
def get_targets_with_weights(batch_data, initial_ensembles, size_ens): 
    """Apply every ensemble member's flattened network weights to ``batch_data``.

    Each row of ``initial_ensembles`` is read, in column order, as
    hidden-layer weights, hidden-layer biases, output-layer weights and an
    output bias.  The two affine maps are applied back to back with no
    activation in between.  The slice sizes are taken from the notebook-level
    ``weights_ann_1`` (entries 0 and 2) and ``h1``.

    Parameters
    ----------
    batch_data : 2-D array, one row per sample.
    initial_ensembles : 2-D array, one row per ensemble member.
    size_ens : number of ensemble members (rows of ``initial_ensembles``).

    Returns
    -------
    (outputs, stack)
        ``outputs`` has shape ``(size_ens, batch_data.shape[0])``; ``stack``
        is ``outputs`` with ``initial_ensembles`` appended column-wise.
    """
    out_dim = 1
    n_features = batch_data.shape[1]

    # Column offsets of the four parameter groups inside one ensemble row.
    w1_end = len(weights_ann_1[0].ravel())
    b1_end = w1_end + h1
    w2_end = b1_end + len(weights_ann_1[2].ravel())
    b2_end = w2_end + out_dim

    w1 = initial_ensembles[:, :w1_end].reshape(size_ens, n_features, h1)
    b1 = initial_ensembles[:, w1_end:b1_end].reshape(size_ens, 1, h1)
    w2 = initial_ensembles[:, b1_end:w2_end].reshape(size_ens, h1, out_dim)
    b2 = initial_ensembles[:, w2_end:b2_end].reshape(size_ens, 1, out_dim)

    # (sample, feature) x (member, feature, hidden) -> (member, sample, hidden)
    hidden = np.einsum('ij,kjl->kil', batch_data, w1)
    hidden = hidden + b1

    # (member, sample, hidden) x (member, hidden, 1) -> (member, sample, 1)
    projected = np.einsum('ijk,ikl->ijl', hidden, w2)
    final_output_1 = (projected + b2)[:, :, 0]

    stack = np.hstack((final_output_1, initial_ensembles))
    return final_output_1, stack

In [50]:
def ann(hidden = 32, input_shape = 256, output_shape = 1): 
    """Build a Keras model: input -> Dense(hidden) -> Dense(output_shape, relu).

    The hidden layer has no activation; the prediction layer uses ReLU.
    In the visible part of this notebook the model is only queried for its
    parameter count and weight shapes.

    Parameters
    ----------
    hidden : number of units in the hidden Dense layer.
    input_shape : number of input features (an int), or an explicit shape
        tuple/list which is passed through unchanged.
    output_shape : number of units in the prediction layer.

    Returns
    -------
    tf.keras.models.Model
    """
    # `(input_shape)` is just the int itself, not a 1-tuple; give Keras an
    # explicit shape tuple, which every Keras version accepts.
    if isinstance(input_shape, (tuple, list)):
        shape = tuple(input_shape)
    else:
        shape = (input_shape,)

    input_layer = tf.keras.layers.Input(shape = shape)
    hidden_output = tf.keras.layers.Dense(hidden)(input_layer)
    pred_output = tf.keras.layers.Dense(output_shape, activation = "relu")(hidden_output)
    return tf.keras.models.Model(input_layer, pred_output)

In [51]:
def generate_initial_ensembles(num_weights, lambda1, size_ens):
    """Draw ``size_ens`` samples from a zero-mean Gaussian with covariance ``lambda1 * I``.

    The distribution has ``num_weights`` dimensions.  The return value is
    whatever ``scipy.stats.multivariate_normal(...).rvs(size_ens)`` yields.
    """
    isotropic_cov = np.identity(num_weights) * lambda1
    prior = mvn(np.zeros(num_weights), isotropic_cov)
    return prior.rvs(size_ens)

In [52]:
def expit(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated element-wise.

    This is a sigmoid, not a softmax.  It is evaluated through
    ``exp(-|x|)`` so that large-magnitude inputs never overflow.

    Parameters
    ----------
    x : scalar or array-like of real numbers.

    Returns
    -------
    A NumPy float for scalar input, otherwise an ndarray shaped like ``x``.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))  # always in (0, 1], so no overflow is possible
    # For x >= 0: 1 / (1 + e^-x).  For x < 0: e^x / (1 + e^x).  Same function.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # `[()]` turns a 0-d result back into a scalar and leaves arrays untouched.
    return result[()]

In [53]:
# Template network (32 inputs -> 16 hidden units -> 1 output). In the visible
# notebook it is only queried for its weight shapes and parameter count.
samp_ann =  ann(hidden = 16, input_shape = 32, output_shape = 1)

In [54]:
# Weight arrays of the template network; entries [0] and [2] are read by the
# forward-pass helpers to size the hidden and output weight slices.
weights_ann_1 = samp_ann.get_weights()

In [55]:
# Width of the hidden layer (last dimension of layer 1's output).
h1  = samp_ann.layers[1].output.shape[-1]

In [56]:
samp_ann.count_params()

545

In [57]:
# Alias for the hidden width; in the visible notebook it is only referenced
# from commented-out code.
hidden_neurons = h1

In [58]:
# Number of trainable scalars in one sub-network (545 for the 32-16-1 template,
# per the cell output above); used as the column width of each ensemble block.
samp_ann_params = samp_ann.count_params()

In [59]:
def get_initial_X_t(data1, data2, size_ens, var_weights = 1.0, var_weight_weights = 4.0):
    """Sample the initial ensemble and assemble the augmented state ``X_t``.

    Four independent weight ensembles are drawn: two evaluated on ``data1``
    and two on ``data2``.  Three extra 4-wide groups are then sampled: the
    mixing weights (variance ``var_weight_weights``), the "L" entries and the
    "D" entries (variance ``var_weights``).  The D group is laid out as
    ``[D1, 0, D2, 0]``.

    Returns
    -------
    X_t : array of shape ``(size_ens, n_samples + samp_ann_params + 3, 4)``
    initial_ensembles : the four weight ensembles stacked column-wise
    weights, L, D : the three extra groups, each of shape ``(size_ens, 4)``
    """
    member_sets = []
    stacked_columns = []
    # Sub-networks 1-2 read data1, sub-networks 3-4 read data2.
    for features in (data1, data1, data2, data2):
        members = generate_initial_ensembles(samp_ann_params, var_weights, size_ens)
        _, stacked = get_targets_with_weights(features, members, size_ens = size_ens)
        member_sets.append(members)
        stacked_columns.append(stacked[..., np.newaxis])
    X_t = np.concatenate(stacked_columns, axis = -1)

    initial_ensembles_for_weights = generate_initial_ensembles(4, var_weight_weights, size_ens)[:, np.newaxis, :]
    initial_ensembles_for_L = generate_initial_ensembles(4, var_weights, size_ens)[:, np.newaxis, :]

    d_first = generate_initial_ensembles(1, var_weights, size_ens).reshape(-1, 1, 1)
    d_second = generate_initial_ensembles(1, var_weights, size_ens).reshape(-1, 1, 1)
    zero_entry = np.zeros((size_ens, 1, 1))
    initial_ensembles_for_D = np.concatenate((d_first, zero_entry, d_second, zero_entry), axis = 2)

    # Append the three extra groups as additional rows of every member's state.
    X_t = np.concatenate(
        (X_t, initial_ensembles_for_weights, initial_ensembles_for_L, initial_ensembles_for_D),
        axis = 1,
    )

    initial_ensembles = np.hstack(member_sets)

    return (
        X_t,
        initial_ensembles,
        initial_ensembles_for_weights[:, 0, :],
        initial_ensembles_for_L[:, 0, :],
        initial_ensembles_for_D[:, 0, :],
    )

In [60]:
def get_weighted_targets_with_weights(batch_data, initial_ensembles, size_ens, weights): 
    """Forward pass of every ensemble member, with outputs scaled by ``weights``.

    The rows of ``initial_ensembles`` are unpacked as hidden weights, hidden
    biases, output weights and output bias.  Sizes come from the
    notebook-level ``weights_ann_1`` and ``h1``.  The two linear layers are
    applied without any activation and the result is multiplied by
    ``weights`` (broadcast against the ``(size_ens, n_samples)`` output).

    Returns
    -------
    (weighted_outputs, stack)
        ``stack`` is ``weighted_outputs`` with ``initial_ensembles`` appended
        column-wise.
    """
    target_dim = 1
    n_inputs = batch_data.shape[1]

    n_hidden_1 = len(weights_ann_1[0].ravel())
    n_pred_weights_1 = len(weights_ann_1[2].ravel())

    # Walk a cursor across the columns: hidden W | hidden b | output W | output b.
    start = 0
    stop = n_hidden_1
    hidden_w = initial_ensembles[:, start:stop].reshape(size_ens, n_inputs, h1)

    start, stop = stop, stop + h1
    hidden_b = initial_ensembles[:, start:stop].reshape(size_ens, 1, h1)

    start, stop = stop, stop + n_pred_weights_1
    out_w = initial_ensembles[:, start:stop].reshape(size_ens, h1, target_dim)

    start, stop = stop, stop + target_dim
    out_b = initial_ensembles[:, start:stop].reshape(size_ens, 1, target_dim)

    hidden_act = np.einsum('ij,kjl->kil', batch_data, hidden_w)
    hidden_act = hidden_act + hidden_b

    raw_out = np.einsum('ijk,ikl->ijl', hidden_act, out_w)
    raw_out = raw_out + out_b

    final_output_1 = raw_out[:, :, 0]
    final_output_1 = final_output_1*weights

    stack = np.hstack((final_output_1, initial_ensembles))
    return final_output_1, stack

In [61]:
# Fitted target scaler; its `inverse_transform` maps predictions back to the
# original target units.  The context manager closes the file handle, which
# the bare `open()` call never did.
# NOTE(review): pickle.load executes arbitrary code - only load trusted files.
with open('..//Data//target_scaler.pkl', 'rb') as scaler_file:
    std_targets = pickle.load(scaler_file)

In [62]:
# R_t = np.array([[0.02, 0], [0, 0.02]])

In [63]:
# var1 = R_t[0,0]
# var2 = R_t[1,1]
# cov = R_t[1,0]

In [64]:
from scipy.stats import beta

In [65]:
# Frozen Beta(1, 9) used as the default perturbation of the mixing weights in
# get_predictions / get_updated_ensemble. get_results has its own default,
# beta(1,19), and the runs below pass beta(1,11).
fudging_beta = beta(1,9)

In [66]:
def forward_operation(data1, data2, combined_ensembles , size_ens, fudging_beta):
    """Training-time forward pass with randomly perturbed mixing weights.

    ``combined_ensembles`` holds, per member, four sub-network weight vectors
    of ``samp_ann_params`` columns each, followed by three 4-wide groups:
    mixing weights, "L" entries and "D" entries.  The mixing weights go
    through a softmax; the first two and last two entries are summed into a
    weight for the ``data1`` networks and one for the ``data2`` networks.
    Each gets an independent ``fudging_beta`` draw added and the pair is
    renormalised to sum to one.

    Returns
    -------
    X_t : augmented state, shape ``(size_ens, n_samples + params + 3, 4)``
    initial_ensembles : the seven column groups re-stacked column-wise
    weighted_alogp : sum of the weighted outputs of sub-networks 1 and 3
    weighted_psa : sum of the weighted outputs of sub-networks 2 and 4
    model_1, model_2 : the normalised mixing weights, shape ``(size_ens, 1)``
    """
    params = samp_ann_params
    tail = 4*params

    sub_networks = [combined_ensembles[:, k*params:(k + 1)*params] for k in range(4)]
    initial_ensembles_for_weights = combined_ensembles[:, tail:(tail + 4)]
    initial_ensembles_for_L = combined_ensembles[:, (tail + 4):(tail + 8)]
    initial_ensembles_for_D = combined_ensembles[:, (tail + 8):(tail + 12)]

    softmax_weights = tf.math.softmax(initial_ensembles_for_weights).numpy()

    # Perturb each modality weight (model_1 is drawn first), then renormalise.
    model_1 = softmax_weights[:, :2].sum(1).reshape(-1,1) + fudging_beta.rvs(size_ens).reshape(-1,1)
    model_2 = softmax_weights[:, 2:].sum(1).reshape(-1,1) + fudging_beta.rvs(size_ens).reshape(-1,1)
    normaliser = model_1 + model_2
    model_1 = model_1/normaliser
    model_2 = model_2/normaliser

    outputs = []
    stacked_columns = []
    inputs_per_network = (data1, data1, data2, data2)
    mix_per_network = (model_1, model_1, model_2, model_2)
    for features, members, mix in zip(inputs_per_network, sub_networks, mix_per_network):
        out, stacked = get_weighted_targets_with_weights(features, members, size_ens = size_ens,
                                                         weights=mix)
        outputs.append(out)
        stacked_columns.append(np.expand_dims(stacked, -1))

    X_t = np.concatenate(stacked_columns, axis = -1)

    initial_ensembles = np.hstack((*sub_networks,
                                   initial_ensembles_for_weights,
                                   initial_ensembles_for_L,
                                   initial_ensembles_for_D))

    # The three extra groups become three additional rows of each member's state.
    extra_rows = [np.expand_dims(group, 1) for group in
                  (initial_ensembles_for_weights, initial_ensembles_for_L, initial_ensembles_for_D)]
    X_t = np.concatenate([X_t] + extra_rows, axis = 1)

    weighted_alogp = outputs[0] + outputs[2]
    weighted_psa = outputs[1] + outputs[3]

    return X_t, initial_ensembles, weighted_alogp, weighted_psa, model_1, model_2

In [67]:
def forward_operation_test(data1, data2, combined_ensembles , size_ens):
    """Evaluation-time forward pass using the unperturbed softmax mixing weights.

    The column layout of ``combined_ensembles`` is four sub-network blocks of
    ``samp_ann_params`` columns followed by the 4-wide mixing-weight, "L" and
    "D" groups.  No random perturbation and no renormalisation is applied:
    the two mixing weights are plain sums of softmax entries.

    Returns
    -------
    X_t, initial_ensembles, weighted_alogp, weighted_psa, model_1, model_2
        ``weighted_alogp`` adds the outputs of sub-networks 1 and 3,
        ``weighted_psa`` those of sub-networks 2 and 4.
    """
    params = samp_ann_params

    def block(first, last):
        # Columns [first, last) of every ensemble member.
        return combined_ensembles[:, first:last]

    initial_ensembles1 = block(0, params)
    initial_ensembles2 = block(params, 2*params)
    initial_ensembles3 = block(2*params, 3*params)
    initial_ensembles4 = block(3*params, 4*params)
    initial_ensembles_for_weights = block(4*params, 4*params + 4)
    initial_ensembles_for_L = block(4*params + 4, 4*params + 8)
    initial_ensembles_for_D = block(4*params + 8, 4*params + 12)

    softmax_weights = tf.math.softmax(initial_ensembles_for_weights).numpy()
    model_1 = softmax_weights[:, :2].sum(1).reshape(-1,1) 
    model_2 = softmax_weights[:, 2:].sum(1).reshape(-1,1) 

    def run(features, members, mix):
        return get_weighted_targets_with_weights(features, members, size_ens = size_ens, weights=mix)

    data1_out1, data1_stack1 = run(data1, initial_ensembles1, model_1)
    data1_out2, data1_stack2 = run(data1, initial_ensembles2, model_1)
    data2_out1, data2_stack1 = run(data2, initial_ensembles3, model_2)
    data2_out2, data2_stack2 = run(data2, initial_ensembles4, model_2)

    X_t = np.concatenate(
        [np.expand_dims(s, -1) for s in (data1_stack1, data1_stack2, data2_stack1, data2_stack2)],
        axis = -1,
    )

    initial_ensembles = np.hstack((initial_ensembles1, initial_ensembles2,
                                   initial_ensembles3, initial_ensembles4,
                                   initial_ensembles_for_weights,
                                   initial_ensembles_for_L,
                                   initial_ensembles_for_D))

    X_t = np.concatenate(
        (X_t,
         np.expand_dims(initial_ensembles_for_weights, 1),
         np.expand_dims(initial_ensembles_for_L, 1),
         np.expand_dims(initial_ensembles_for_D, 1)),
        axis = 1,
    )

    weighted_alogp = data1_out1 + data2_out1
    weighted_psa = data1_out2 + data2_out2

    return X_t, initial_ensembles, weighted_alogp, weighted_psa, model_1, model_2

In [68]:
# samp_ann =  ann(hidden = 16, input_shape = 32, output_shape = 1)

In [69]:
# Width of one combined ensemble row: four sub-networks, each contributing its
# parameters plus one mixing-weight, one L and one D entry (4*(545 + 3) = 2192).
total_weights = 4*(samp_ann.count_params() + 1 + 1 + 1)

In [70]:
# Ratio between the state dimension and the ensemble size (see size_ens below).
reduction = 10

In [71]:
# Number of ensemble members (2192 // 10 = 219). Several functions below read
# this global directly or capture it as a default argument at definition time.
size_ens = total_weights//reduction

In [72]:
size_ens

219

In [73]:
# Aggregation matrix, shape (4, 2) after the transpose. Used as G_t.T in
# calculate_mu_bar_G_bar, it adds sub-networks 1 and 3 into the first target
# and sub-networks 2 and 4 into the second.
G_t = [[1, 0, 1, 0], [0, 1, 0, 1]]
G_t = np.array(G_t).T

In [74]:
def get_predictions(data1, data2, initial_ensembles, fudging_beta  =fudging_beta): 
    """Training-time ensemble predictions for both targets.

    Runs ``forward_operation`` with the notebook-level ``size_ens`` and
    returns ``(preds, w1, w2)``.  ``preds`` has shape
    ``(size_ens, n_samples, 2)`` and ``w1`` / ``w2`` are the two mixing
    weights that were used.
    """
    forward = forward_operation(data1, data2, initial_ensembles, size_ens = size_ens, fudging_beta = fudging_beta)
    weighted_alogp, weighted_psa, w1, w2 = forward[2:]
    # Put the two targets side by side on a new trailing axis.
    preds = np.stack((weighted_alogp, weighted_psa), axis = -1)
    return preds, w1, w2

In [75]:
def get_predictions_test(data1, data2, initial_ensembles): 
    """Evaluation-time ensemble predictions for both targets.

    Runs ``forward_operation_test`` with the notebook-level ``size_ens`` and
    returns ``(preds, w1, w2)``, where ``preds`` has shape
    ``(size_ens, n_samples, 2)``.
    """
    forward = forward_operation_test(data1, data2, initial_ensembles, size_ens = size_ens)
    weighted_alogp, weighted_psa, w1, w2 = forward[2:]
    preds = np.stack((weighted_alogp, weighted_psa), axis = -1)
    return preds, w1, w2

In [76]:
def calculate_mu_bar_G_bar(data1, data2, initial_ensembles, fudging_beta):
    """Ensemble mean of the parameters and of the aggregated predictions.

    The forward model is run with the notebook-level ``size_ens``.  Each
    member's augmented state is flattened and the prediction entries are
    projected out through ``kron(G_t.T, H_t)``.

    Returns
    -------
    mu_bar : column vector, mean of ``initial_ensembles`` over members
    G_bar : column vector, mean of ``G_u`` over members
    G_u : array of shape ``(n_members, 2 * n_samples)`` with the first
        target's predictions followed by the second's
    """
    n_obs = data1.shape[0]

    # H_t keeps the first n_obs rows (the predictions) of one sub-network's
    # state column and zeroes out its parameter / extra rows.
    H_t = np.hstack((np.identity(n_obs), np.zeros((n_obs, samp_ann_params + 1 + 1 + 1))))
    script_H_t = np.kron(G_t.T, H_t)

    mu_bar = initial_ensembles.mean(0)

    X_t = forward_operation(data1, data2, initial_ensembles, size_ens = size_ens, fudging_beta = fudging_beta)[0]
    # (member, row, network) -> (member, network, row) -> one flat vector per member
    X_t = X_t.transpose((0,2,1))
    X_t = X_t.reshape(X_t.shape[0], -1)

    G_u = (script_H_t@X_t.T).T
    G_bar = G_u.mean(0).ravel()
    return mu_bar.reshape(-1,1), G_bar.reshape(-1,1), G_u

In [77]:
def calculate_C_u(initial_ensembles, mu_bar, G_bar, G_u): 
    """Cross-covariance between ensemble parameters and ensemble predictions.

    Computes ``(1/N) * sum_j (u_j - mu_bar)(G(u_j) - G_bar)^T`` as a single
    matrix product.  ``N`` and the output shape come from the arguments
    themselves rather than from notebook-level globals.

    Parameters
    ----------
    initial_ensembles : array ``(N, n_params)``, one row per member.
    mu_bar : mean parameter vector (any shape with ``n_params`` elements).
    G_bar : mean prediction vector (any shape with ``n_obs`` elements).
    G_u : array ``(N, n_obs)`` of per-member predictions.

    Returns
    -------
    (C, G_u_minus_G_bar)
        ``C`` has shape ``(n_params, n_obs)``; the second item is the centred
        prediction matrix.
    """
    initial_ensembles = np.asarray(initial_ensembles)
    n_members = initial_ensembles.shape[0]

    u_j_minus_u_bar = initial_ensembles - np.reshape(mu_bar, (1, -1))
    G_u_minus_G_bar = np.asarray(G_u) - np.reshape(G_bar, (1, -1))

    # Sum of per-member outer products == A^T B (normalised by N, not N - 1).
    c = u_j_minus_u_bar.T @ G_u_minus_G_bar
    return c/n_members, G_u_minus_G_bar

In [78]:
def calculate_D_u( G_bar, G_u): 
    """Ensemble covariance of the predictions.

    Computes ``(1/N) * sum_j (G(u_j) - G_bar)(G(u_j) - G_bar)^T`` with one
    matrix product.  ``N`` is the number of rows of ``G_u``, not a
    notebook-level global.

    Parameters
    ----------
    G_bar : mean prediction vector (any shape with ``n_obs`` elements).
    G_u : array ``(N, n_obs)`` of per-member predictions.

    Returns
    -------
    Array of shape ``(n_obs, n_obs)``.
    """
    G_u = np.asarray(G_u)
    n_members = G_u.shape[0]

    G_u_minus_G_bar = G_u - np.reshape(G_bar, (1, -1))
    # Sum of per-member outer products == A^T A (normalised by N, not N - 1).
    d = G_u_minus_G_bar.T @ G_u_minus_G_bar
    return d/n_members

In [79]:
def get_updated_ensemble(data1, data2, initial_ensembles, y_train, size_ens = size_ens, inflation_factor = 1.0, fudging_beta = fudging_beta):
    """One ensemble-Kalman-style update of all members towards ``y_train``.

    The update is ``u_j <- u_j + C (D + inflation_factor * R)^-1 (y + eps_j - G(u_j))``.
    ``C`` and ``D`` are the ensemble cross-covariance and prediction
    covariance, ``R`` is the observation covariance built by ``create_cov``,
    and ``eps_j ~ N(0, R)`` is a fresh perturbation for each member.

    Parameters
    ----------
    data1, data2 : feature matrices fed to the two groups of sub-networks.
    initial_ensembles : array ``(n_members, total state width)``.
    y_train : array ``(n_samples, 2)``.  It is flattened target-by-target to
        line up with the columns of ``G_u``.
    size_ens : number of noise draws; must equal the number of ensemble
        members (the forward model itself uses the notebook-level
        ``size_ens``).
    inflation_factor : multiplier applied to ``R`` inside the gain only; the
        observation noise is always drawn from the un-inflated ``R``.
    fudging_beta : distribution used to perturb the mixing weights.

    Returns
    -------
    Updated ensemble, same shape as ``initial_ensembles``.
    """
    mu_bar, G_bar, G_u = calculate_mu_bar_G_bar(data1, data2, initial_ensembles, fudging_beta)
    C, _ = calculate_C_u(initial_ensembles, mu_bar, G_bar, G_u)
    D = calculate_D_u( G_bar, G_u)
    _, R_t = create_cov(data1.shape[0],initial_ensembles)

    D_plus_cov = D + (R_t *inflation_factor)

    # Perturbed observations: one noisy copy of the targets per member.
    noise_mvn = mvn(np.zeros((R_t.shape[0], )), R_t)
    fudging = noise_mvn.rvs(size_ens)
    interim = (y_train.T.flatten().reshape(1,-1) + fudging)
    right_quant = interim - G_u

    # Solve (D + R) Z = innovations^T instead of forming the explicit inverse:
    # mathematically the same gain, with less work and better conditioning.
    solved = np.linalg.solve(D_plus_cov, right_quant.T)
    mid_times_right = C@solved

    return initial_ensembles + mid_times_right.T

In [80]:
# Number of regression targets; read by create_cov to size its 2x2 matrices.
target_dim = 2

In [81]:
# NOTE(review): not referenced anywhere in the visible part of the notebook.
lambda_D = 1

In [82]:
def inverse_transform(data, idx):
    """Map slice ``data[idx]`` of a 3-D array back through ``std_targets.inverse_transform``."""
    return std_targets.inverse_transform(data[idx, :, :])

In [83]:
from joblib import Parallel, delayed

In [84]:
def create_cov(shape, initial_ensembles):
    """Build the observation covariance from the trailing columns of the ensemble.

    The last eight columns are used, averaged over members: columns
    ``-8:-4`` fill a ``target_dim x target_dim`` factor ``B`` (two diagonal
    entries, then the two off-diagonals).  The softplus of columns ``-4:``
    supplies two extra diagonal variances, taken from positions 0 and 2.
    The small covariance is ``B B^T + diag(variances)``, passed through a
    Cholesky factorisation and multiplied back out.

    Parameters
    ----------
    shape : number of samples ``n``; sets the size of the block matrix.
    initial_ensembles : array with one row per ensemble member.

    Returns
    -------
    (cov_mat_final, R_t)
        ``cov_mat_final`` is the small covariance matrix.  ``R_t`` is the
        ``2n x 2n`` block matrix ``[[var1*I, cov*I], [cov*I, var2*I]]``.
    """
    factor_entries = initial_ensembles[:, -8:-4].mean(0)
    base_cov = np.identity(target_dim)
    # Entry order: [0,0], [1,1], [0,1], [1,0].
    base_cov[[0, 1, 0, 1], [0, 1, 1, 0]] = factor_entries

    softplus_mean = tf.math.softplus(initial_ensembles[:, -4:]).numpy().mean(0)
    base_variances = np.identity(target_dim)
    base_variances[0,0] = softplus_mean[0]
    base_variances[1,1] = softplus_mean[2]

    # cholesky raises LinAlgError when the matrix is not positive definite.
    chol = np.linalg.cholesky(base_cov@base_cov.T + base_variances)
    cov_mat_final = chol@chol.T

    if not is_pos_def(cov_mat_final):
        print("resulting cov matrix is not positive semi definite")

    var1 = cov_mat_final[0,0]
    var2 = cov_mat_final[1,1]
    cov = cov_mat_final[1,0]

    eye_n = np.identity(shape)
    off_diag = cov*eye_n
    R_t = np.block([[var1*eye_n, off_diag],
                    [off_diag.T, var2*eye_n]])

    return cov_mat_final, R_t
    

In [85]:
def is_pos_def(x):
    """Return True when every eigenvalue of the square matrix ``x`` is strictly positive."""
    eigenvalues = np.linalg.eigvals(x)
    return (eigenvalues > 0).all()

In [86]:
# Pre-computed folds. prepare_data indexes this object by fold number and reads
# items 0-3 of each entry as x_train, x_valid, y_train, y_valid.
# NOTE(review): pickle.load executes arbitrary code - only load trusted files.
with open("..//Data//smiles_to_rdkit_70_30_with_cov_minus_0.27_var.pickle", "rb") as f: 
    catch = pickle.load(f)

In [87]:
# idx = 1

In [88]:
def prepare_data(idx, var_weights = 1.0, var_weight_weights = 4.0): 
    """Unpack fold ``idx`` from the global ``catch`` and sample its initial ensemble.

    Feature columns ``[:32]`` are treated as the SMILES features and columns
    ``[32:]`` as the RDKit features.  The first two target columns are
    returned as the ``*_actual`` targets and the remaining columns as the
    targets used for the update.

    Returns
    -------
    smiles_feats_train, rdkit_feats_train, smiles_feats_valid,
    rdkit_feats_valid, y_train, y_train_actual, y_valid, y_valid_actual,
    initial_ensembles
        ``initial_ensembles`` stacks the four sub-network ensembles followed
        by the mixing-weight, L and D groups.
    """
    fold = catch[idx]
    x_train, x_valid, y_train_all, y_valid_all = (fold[k] for k in range(4))

    n_smiles = 32
    smiles_feats_train, rdkit_feats_train = x_train[:, :n_smiles], x_train[:, n_smiles:]
    smiles_feats_valid, rdkit_feats_valid = x_valid[:, :n_smiles], x_valid[:, n_smiles:]

    y_train_actual, y_train = y_train_all[:, :2], y_train_all[:, 2:]
    y_valid_actual, y_valid = y_valid_all[:, :2], y_valid_all[:, 2:]

    # The augmented state returned first is not needed here, only the samples.
    _, network_members, mix_members, l_members, d_members = get_initial_X_t(
        smiles_feats_train, rdkit_feats_train, size_ens = size_ens,
        var_weights = var_weights, var_weight_weights = var_weight_weights)
    initial_ensembles = np.hstack((network_members, mix_members, l_members, d_members))

    return (smiles_feats_train, rdkit_feats_train, smiles_feats_valid, rdkit_feats_valid,
            y_train, y_train_actual, y_valid, y_valid_actual, initial_ensembles)

In [89]:
# smiles_feats_train, rdkit_feats_train, smiles_feats_valid, rdkit_feats_valid, y_train, y_train_actual, y_valid, y_valid_actual, initial_ensembles  = prepare_data(idx)

In [90]:
# y_train

In [91]:
# y_train_actual

In [92]:
from scipy.linalg import norm

In [93]:
# beta(1,19).rvs(size_ens).mean()

In [94]:
def _interval_metrics(ensemble_preds, y_true):
    """Per-target mean width and coverage of the 2.5-97.5% ensemble interval, plus RMSE of the ensemble mean."""
    lower, upper = np.percentile(ensemble_preds, axis = 0, q = (2.5, 97.5))
    avg_width = (upper - lower).mean(0)
    coverage = ((y_true >= lower) & (y_true <= upper)).mean(0)
    rmse = np.sqrt(((y_true - ensemble_preds.mean(0))**2).mean(0))
    return avg_width, coverage, rmse


def _inverse_transform_members(ensemble_preds):
    """Undo target scaling member by member (serial; each call is tiny)."""
    return np.array([inverse_transform(ensemble_preds, member)
                     for member in range(ensemble_preds.shape[0])])


def _report_fold(idx, summary):
    """Print the per-fold report lines and return ``summary`` unchanged."""
    print("done for fold" + str(idx), flush = True)
    print("train_width" + str(summary[0]), flush = True)
    print("test_width" + str(summary[3]), flush = True)
    print("smiles_weight" + str(summary[6]), flush = True)
    print("rmse_train" + str(summary[2]), flush = True)
    print("rmse_test" + str(summary[5]), flush = True)
    print("smiles_weight_ci" + str([summary[7], summary[8]]), flush = True)
    return summary


def get_results(idx, var_weights = 1.0, var_weight_weights = 1.0, inflation_factor = 1.2, fudging_beta = beta(1,19)):
    """Iterate the ensemble update on fold ``idx`` and report the best iteration.

    After every update the 95% ensemble interval is evaluated on the training
    and validation data.  The "best" iteration is the one with the narrowest
    mean training interval among those whose mean training coverage exceeds
    0.95.  Iteration stops as soon as the mean training coverage drops below
    0.95, or after 10000 updates.

    Parameters
    ----------
    idx : fold index into the global ``catch``.
    var_weights, var_weight_weights : prior variances forwarded to
        ``prepare_data``.
    inflation_factor : forwarded to ``get_updated_ensemble``.
    fudging_beta : distribution used to perturb the mixing weights.

    Returns
    -------
    list
        ``[train_width, train_coverage, train_rmse, test_width,
        test_coverage, test_rmse, smiles_weight_mean, smiles_weight_lo,
        smiles_weight_hi]`` for the best iteration.  If no iteration ever
        exceeded 0.95 coverage, the metrics of the last iteration are
        returned instead (the original raised ``UnboundLocalError`` here).
        If the iteration budget runs out, the best summary found is returned
        (the original returned ``None``).
    """
    (smiles_feats_train, rdkit_feats_train, smiles_feats_valid, rdkit_feats_valid,
     y_train, y_train_actual, y_valid, y_valid_actual, initial_ensembles) = prepare_data(
        idx, var_weights = var_weights, var_weight_weights =var_weight_weights)

    # The reference targets do not change between iterations.
    y_train_cur = std_targets.inverse_transform(y_train_actual)
    y_valid_cur = std_targets.inverse_transform(y_valid_actual)

    max_iterations = 10000
    target_coverage = 0.95

    best_train_width_mean = 100000
    best_summary = None
    latest_summary = None

    for _ in range(0, max_iterations):
        initial_ensembles = get_updated_ensemble(smiles_feats_train, rdkit_feats_train, initial_ensembles, y_train, size_ens, inflation_factor = inflation_factor, fudging_beta = fudging_beta)

        G_u_train, w1, _ = get_predictions(smiles_feats_train, rdkit_feats_train, initial_ensembles, fudging_beta)
        G_u_train = _inverse_transform_members(G_u_train)
        avg_width_train, coverage_train, rmse_train = _interval_metrics(G_u_train, y_train_cur)

        G_u_test, _, _ = get_predictions_test(smiles_feats_valid, rdkit_feats_valid, initial_ensembles)
        G_u_test = _inverse_transform_members(G_u_test)
        avg_width_test, coverage_test, rmse_test = _interval_metrics(G_u_test, y_valid_cur)

        # 95% interval of the data1 ("smiles") mixing weight across members.
        li_smiles_weight, ui_smiles_weight = np.percentile(w1, axis = 0, q = (2.5, 97.5))[:, 0]

        latest_summary = [avg_width_train.tolist(), coverage_train.tolist(), rmse_train.tolist(),
                          avg_width_test.tolist(), coverage_test.tolist(), rmse_test.tolist(),
                          w1.mean(), li_smiles_weight, ui_smiles_weight]

        mean_coverage = coverage_train.mean()

        if (avg_width_train.mean() < best_train_width_mean) and (mean_coverage > target_coverage):
            best_train_width_mean = avg_width_train.mean()
            best_summary = latest_summary

        if mean_coverage < target_coverage:
            if best_summary is None:
                # Coverage was never above the target: fall back to the
                # current metrics instead of crashing on an unset variable.
                print("no iteration exceeded the coverage target for fold" + str(idx), flush = True)
                best_summary = latest_summary
            return _report_fold(idx, best_summary)

    # Iteration budget exhausted without coverage dropping below the target.
    return _report_fold(idx, best_summary if best_summary is not None else latest_summary)


In [95]:
# results_df[results_df["indicator"] == False]

In [96]:
%%time
get_results(45, var_weights = 1.0, var_weight_weights = 2.0, inflation_factor =1.0, fudging_beta = beta(1,11))

done for fold45
train_width[1.1768901684233888, 21.780142495999254]
test_width[0.7814313413999134, 15.899799019630175]
smiles_weight0.8082138288620736
rmse_train[0.22633814361003401, 4.7246524259256155]
rmse_test[0.25983027837722544, 4.832428377708499]
smiles_weight_ci[0.6882519053340528, 0.8852767902993528]
CPU times: user 3min 12s, sys: 2min 24s, total: 5min 36s
Wall time: 1min 30s


[[1.1768901684233888, 21.780142495999254],
 [0.9694019471488178, 0.9568845618915159],
 [0.22633814361003401, 4.7246524259256155],
 [0.7814313413999134, 15.899799019630175],
 [0.9208333333333333, 0.9166666666666666],
 [0.25983027837722544, 4.832428377708499],
 0.8082138288620736,
 0.6882519053340528,
 0.8852767902993528]

In [97]:
# from joblib import Parallel, delayed

In [98]:
# Run get_results for folds 0-49 across 15 worker processes; catch_all collects
# one result per fold, in fold order.
catch_all = Parallel(n_jobs = 15, verbose = 10)(delayed(get_results)(idx,var_weights = 1.0, var_weight_weights = 2.0, inflation_factor =1.0, fudging_beta = beta(1,11)) for idx in range(0,50))

[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.


done for fold10
train_width[1.383769858105957, 26.70667694807556]
test_width[1.1702962251148679, 23.71047411605225]
smiles_weight0.722785503716166
rmse_train[0.257842306094016, 6.134194283141293]
rmse_test[0.25623392180548515, 6.369228967405862]
smiles_weight_ci[0.5918996619066429, 0.8007196902192004]
done for fold3
train_width[1.131138155808941, 25.256141350740307]
test_width[0.8828260855358716, 21.260202302716586]
smiles_weight0.7603473937206106
rmse_train[0.2260089419817095, 6.1626511590900765]
rmse_test[0.2661986364001339, 5.944427073476333]
smiles_weight_ci[0.6297783235391287, 0.8388915810517905]


[Parallel(n_jobs=15)]: Done   2 tasks      | elapsed:  2.9min


done for fold14
train_width[1.193399030121784, 23.056332692073077]
test_width[0.921441051964453, 19.720675924526823]
smiles_weight0.765432890399967
rmse_train[0.21752952699702724, 4.83551075705332]
rmse_test[0.4314010987923803, 11.178773530495858]
smiles_weight_ci[0.6263691631414152, 0.8431366291178731]
done for fold4
train_width[1.233305198288065, 25.311349751425446]
test_width[0.9511414787041601, 20.144779125310077]
smiles_weight0.734594350184862
rmse_train[0.2495899617556342, 5.877236643636227]
rmse_test[0.24081019949031393, 5.442407128969248]
smiles_weight_ci[0.5963955505972731, 0.8256024754327442]
done for fold5
train_width[1.1097237368809063, 23.24846146928907]
test_width[1.0007183887766697, 22.11367187249369]
smiles_weight0.710351938535847
rmse_train[0.25654319219499033, 5.5399598436621265]
rmse_test[0.7718502122364895, 7.906333606578988]
smiles_weight_ci[0.5875140770874827, 0.7972196638331257]
done for fold1
train_width[1.300792397840385, 22.370514501738423]
test_width[0.883249

[Parallel(n_jobs=15)]: Done  11 tasks      | elapsed:  3.5min


done for fold12
train_width[0.9672371576398279, 19.017440411154702]
test_width[0.6996655008715539, 16.252279314094732]
smiles_weight0.762485786451626
rmse_train[0.21862794205463287, 4.458161611826107]
rmse_test[0.2942152366413473, 6.9053779182728645]
smiles_weight_ci[0.6489825456046031, 0.8444406555161174]
done for fold7
train_width[1.0134539875655812, 18.46902621156467]
test_width[0.7934724870456946, 16.464674886833286]
smiles_weight0.732328851777601
rmse_train[0.17975982647132802, 5.648960199009696]
rmse_test[0.2954465995984862, 6.757968997954579]
smiles_weight_ci[0.6140722761481177, 0.8220292386253257]
done for fold8
train_width[0.8155897269487488, 17.11280221623901]
test_width[0.6634738319105634, 15.95501097443364]
smiles_weight0.7420023743556221
rmse_train[0.1692896618384954, 4.382509570586058]
rmse_test[0.2198836191076287, 4.921364272011749]
smiles_weight_ci[0.6284076758421356, 0.81953156634015]
done for fold9
train_width[0.9127053498288296, 19.41724300131906]
test_width[0.591624

[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:  6.2min


done for fold18
train_width[1.0255355587787167, 17.978169694590715]
test_width[0.6297273930984045, 13.094678006862312]
smiles_weight0.8031421775314604
rmse_train[0.19184763950273948, 4.096748473516895]
rmse_test[0.24636196084862008, 3.6470274420505047]
smiles_weight_ci[0.6707021218818406, 0.8839173621764638]
done for fold20
train_width[1.093410034710873, 25.844131650528368]
test_width[0.8949781803504886, 21.89702885152485]
smiles_weight0.701786969704219
rmse_train[0.2515596067785892, 6.585656138973801]
rmse_test[0.21503749685657136, 5.496708733798464]
smiles_weight_ci[0.5805941864761949, 0.7785573958235862]
done for fold25
train_width[1.3101949665614618, 23.931823784754886]
test_width[0.869483662858647, 16.87877068578701]
smiles_weight0.8253763895955214
rmse_train[0.2852115605227443, 4.379427435162486]
rmse_test[0.7014251164395683, 6.523338240624585]
smiles_weight_ci[0.6696913329183231, 0.9033533878605953]
done for fold19
train_width[0.9161758640105783, 18.78458480333741]
test_width[0.

[Parallel(n_jobs=15)]: Done  27 out of  50 | elapsed:  7.2min remaining:  6.2min


done for fold24
train_width[0.7355547199282855, 14.007290316009211]
test_width[0.46651112175897563, 10.850178775495376]
smiles_weight0.7580051992143725
rmse_train[0.14935625309369735, 3.7687785131097717]
rmse_test[0.1877624739085207, 3.284207517900468]
smiles_weight_ci[0.6373770734998397, 0.832641676720404]
done for fold29
train_width[0.9390021647846786, 18.858436762473005]
test_width[0.6798784457765339, 16.42441914619603]
smiles_weight0.7692083912493403
rmse_train[0.16069792443679398, 4.435685530713067]
rmse_test[0.21635222089595146, 15.214219227979267]
smiles_weight_ci[0.6546836452284445, 0.8498767727821674]
done for fold30
train_width[1.1162859517086676, 23.426754360731486]
test_width[0.7503381281989759, 17.58750815641273]
smiles_weight0.7689199365429805
rmse_train[0.24663333942938534, 4.652510392686731]
rmse_test[0.24567291353331236, 4.442502637342233]
smiles_weight_ci[0.6454780131803132, 0.8516762504751134]
done for fold34
train_width[1.4091095788354888, 30.855115430151873]
test_w

[Parallel(n_jobs=15)]: Done  33 out of  50 | elapsed:  9.3min remaining:  4.8min


done for fold26
train_width[0.9727860747139109, 15.612291868467072]
test_width[0.4034619623471197, 9.223620947963642]
smiles_weight0.8378259318232995
rmse_train[0.13459793126561242, 3.3695071746193905]
rmse_test[0.25277810292560093, 4.53891502994718]
smiles_weight_ci[0.7072618828220528, 0.9034041180676164]
done for fold35
train_width[1.1770041442713242, 21.204240693590783]
test_width[0.8907865029412618, 18.267642160113066]
smiles_weight0.7208097893562785
rmse_train[0.2286842526550921, 5.020764522927078]
rmse_test[0.19996292474741678, 5.26730470865548]
smiles_weight_ci[0.6028638793639995, 0.8117339035395609]
done for fold39
train_width[1.6986903652608483, 35.75172498898063]
test_width[1.4537948471502877, 31.402984763264158]
smiles_weight0.6903975289190067
rmse_train[0.32255999918652106, 7.656626267807969]
rmse_test[0.2731170286230813, 7.22935851438235]
smiles_weight_ci[0.5698176500725314, 0.7844502639677665]
done for fold36
train_width[1.3248103585921567, 30.15049099906597]
test_width[1

[Parallel(n_jobs=15)]: Done  39 out of  50 | elapsed: 10.3min remaining:  2.9min


done for fold37
train_width[0.8509376742406983, 19.146456058476673]
test_width[0.6743875495966429, 16.35029637060577]
smiles_weight0.7191243829512606
rmse_train[0.2059574454620398, 4.8853425778025095]
rmse_test[0.26023313873034837, 4.342642848011997]
smiles_weight_ci[0.6043610154966433, 0.8136893261890443]
done for fold41
train_width[1.0108816484936058, 20.73799312241606]
test_width[0.7736455902334028, 16.732227365307367]
smiles_weight0.7283470392486137
rmse_train[0.1790137997500226, 5.071590094894049]
rmse_test[0.2134201117953949, 5.225678064397543]
smiles_weight_ci[0.6065203012473064, 0.8134661442375807]
done for fold44
train_width[1.7808179387200673, 38.487986711552686]
test_width[1.8111629479886737, 38.63791276133568]
smiles_weight0.6202678120708701
rmse_train[0.3202632081256564, 8.778254535603335]
rmse_test[1.853273719288336, 36.53894944901309]
smiles_weight_ci[0.5041108487891021, 0.7140590875963859]
done for fold43
train_width[1.3436424146903565, 25.898250551181988]
test_width[1.

[Parallel(n_jobs=15)]: Done  45 out of  50 | elapsed: 10.8min remaining:  1.2min


done for fold48
train_width[1.0845474245344755, 23.38550072922642]
test_width[0.7515629106116978, 18.49769663591405]
smiles_weight0.7537290174455951
rmse_train[0.21931101216837523, 5.912358420210354]
rmse_test[0.24232410462948747, 5.808656836587668]
smiles_weight_ci[0.624509558537783, 0.835175409317446]
done for fold47
train_width[1.3595624452164365, 23.319959701476172]
test_width[0.9451071024241947, 17.118753740398315]
smiles_weight0.8170264981956588
rmse_train[0.18327655518198513, 4.970520215693674]
rmse_test[0.25465717882491395, 4.250785511329764]
smiles_weight_ci[0.6783631695138149, 0.8881797154693055]
done for fold45
train_width[1.0119630702254458, 18.313373521451712]
test_width[0.690238904913027, 14.51333312450441]
smiles_weight0.7679207009360155
rmse_train[0.21296398756114507, 3.8866964548591825]
rmse_test[0.23765227550361098, 4.267008990154959]
smiles_weight_ci[0.6337398945135226, 0.8571280677127557]
done for fold49
train_width[1.2717776774241398, 23.056521984875406]
test_width

[Parallel(n_jobs=15)]: Done  50 out of  50 | elapsed: 11.9min finished


In [136]:
# item

In [137]:
# catch_all[-1]

In [138]:
# catch_inner

In [99]:
# Flatten every entry of catch_all into one flat row: list-valued elements
# are expanded in place, any other element is kept as a single value.
all_catch = []
for item in catch_all:
    catch_inner = []
    for inner in item:
        # isinstance (rather than comparing type objects) is the idiomatic
        # check, and the else-branch makes the two cases mutually exclusive.
        if isinstance(inner, list):
            catch_inner.extend(inner)
        else:
            catch_inner.append(inner)
    all_catch.append(catch_inner)

In [100]:
# Build a DataFrame with one row per flattened entry of all_catch.
results_df = pd.DataFrame(all_catch)

In [101]:
# Sanity check: display the frame's (rows, columns) dimensions.
results_df.shape

(50, 15)

In [142]:
# results_df

In [143]:
# results_df.iloc[:,-1].mean()

In [102]:
# Column labels for the 15 values stored per row. The first twelve follow the
# pattern <target>_<split>_<metric>, with the target varying fastest, then the
# metric, then the split; the last three describe the SMILES weight estimate
# and the bounds of its interval.
col_names = [
    f"{target}_{split}_{metric}"
    for split in ("Train", "Test")
    for metric in ("Width", "Coverage", "RMSE")
    for target in ("Alop", "PSA")
] + [
    "Smiles_Avg_Weight",
    "Lower_Interval_Smiles_Weight",
    "Upper_Interval_Smiles_Weight",
]

In [145]:
# results_df.head()

In [103]:
# Label the columns in place. pandas raises ValueError when len(col_names)
# differs from the number of columns, so this must run before any extra
# columns are added to results_df.
results_df.columns = col_names

In [104]:
# Flag rows whose SMILES-weight interval brackets 0.70: lower bound strictly
# below it and upper bound at or above it.
# NOTE(review): 0.70 is presumably the true simulated weight (cf. "70_30" in
# the results file name) -- confirm, and consider a named constant.
results_df["indicator"] = np.logical_and(
    results_df["Lower_Interval_Smiles_Weight"].to_numpy() < 0.70,
    results_df["Upper_Interval_Smiles_Weight"].to_numpy() >= 0.70,
)

In [105]:
# Fraction of rows whose "indicator" flag is True.
results_df["indicator"].mean()

0.98

In [106]:
# Per-row interval width: upper bound minus lower bound.
results_df["width_weight_CI"] = np.subtract(
    results_df["Upper_Interval_Smiles_Weight"].to_numpy(),
    results_df["Lower_Interval_Smiles_Weight"].to_numpy(),
)

In [107]:
# Column-wise mean over all rows, displayed as an (index, value) table.
results_df.mean().reset_index()

Unnamed: 0,index,0
0,Alop_Train_Width,1.132644
1,PSA_Train_Width,22.498574
2,Alop_Train_Coverage,0.963533
3,PSA_Train_Coverage,0.955271
4,Alop_Train_RMSE,0.230676
5,PSA_Train_RMSE,5.205049
6,Alop_Test_Width,0.837056
7,PSA_Test_Width,18.570725
8,Alop_Test_Coverage,0.922917
9,PSA_Test_Coverage,0.931833


In [108]:
# Persist the results table; index=False keeps the row index out of the file.
results_df.to_csv("..//Data//smiles_rdkit_70_30__with_cov_minus_0.27_Simulation_added_beat_noise.csv", index = False)

In [109]:
# Reload the results table from disk; this rebinds results_df to the frame
# parsed from the CSV file.
results_df = pd.read_csv("..//Data//smiles_rdkit_70_30__with_cov_minus_0.27_Simulation_added_beat_noise.csv")

In [111]:
# Column-wise mean of the current results_df, displayed as an (index, value) table.
results_df.mean().reset_index()

Unnamed: 0,index,0
0,Alop_Train_Width,1.132644
1,PSA_Train_Width,22.498574
2,Alop_Train_Coverage,0.963533
3,PSA_Train_Coverage,0.955271
4,Alop_Train_RMSE,0.230676
5,PSA_Train_RMSE,5.205049
6,Alop_Test_Width,0.837056
7,PSA_Test_Width,18.570725
8,Alop_Test_Coverage,0.922917
9,PSA_Test_Coverage,0.931833


In [110]:
# Show the rows whose "indicator" flag is False.
results_df.loc[~results_df["indicator"]]

Unnamed: 0,Alop_Train_Width,PSA_Train_Width,Alop_Train_Coverage,PSA_Train_Coverage,Alop_Train_RMSE,PSA_Train_RMSE,Alop_Test_Width,PSA_Test_Width,Alop_Test_Coverage,PSA_Test_Coverage,Alop_Test_RMSE,PSA_Test_RMSE,Smiles_Avg_Weight,Lower_Interval_Smiles_Weight,Upper_Interval_Smiles_Weight,indicator,width_weight_CI
26,0.972786,15.612292,0.956885,0.955494,0.134598,3.369507,0.403462,9.223621,0.575,0.854167,0.252778,4.538915,0.837826,0.707262,0.903404,False,0.196142


In [153]:
# results_df.mean().reset_index()

In [154]:
# results_df.std()