In [85]:
import os

# TensorFlow's native runtime reads this variable when it is loaded, so it has
# to be set BEFORE `import tensorflow`; setting it afterwards does not silence
# the messages emitted during import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pickle
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.linalg import block_diag
from scipy.stats import multivariate_normal as mvn
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

In [86]:
# Install a catch-all 'ignore' filter: every Python warning raised after this
# cell runs is suppressed for the rest of the session.
warnings.filterwarnings('ignore')

In [87]:
def get_targets_with_weights(batch_data, initial_ensembles, size_ens):
    """Evaluate one small network per ensemble member on `batch_data`.

    Every row of `initial_ensembles` is a flat parameter vector laid out as
    [hidden kernel | hidden bias | output kernel | output bias]. The group
    sizes come from the module-level `weights_ann_1` and `h1`. No activation
    is applied between or after the two affine layers.

    Parameters
    ----------
    batch_data : array of shape (n_samples, n_features)
    initial_ensembles : array of shape (size_ens, n_params)
    size_ens : int
        Number of ensemble members (rows of `initial_ensembles`).

    Returns
    -------
    predictions : array of shape (size_ens, n_samples)
    stack : array of shape (size_ens, n_samples + n_params)
        `predictions` and `initial_ensembles` joined column-wise.
    """
    target_dim = 1
    n_features = batch_data.shape[1]

    # Column boundaries of the four parameter groups inside the flat vector.
    hidden_kernel_end = len(weights_ann_1[0].ravel())
    hidden_bias_end = hidden_kernel_end + h1
    output_kernel_end = hidden_bias_end + len(weights_ann_1[2].ravel())
    output_bias_end = output_kernel_end + target_dim

    hidden_kernel = initial_ensembles[:, :hidden_kernel_end].reshape(size_ens, n_features, h1)
    hidden_bias = initial_ensembles[:, hidden_kernel_end:hidden_bias_end].reshape(size_ens, 1, h1)
    output_kernel = initial_ensembles[:, hidden_bias_end:output_kernel_end].reshape(size_ens, h1, target_dim)
    output_bias = initial_ensembles[:, output_kernel_end:output_bias_end].reshape(size_ens, 1, target_dim)

    # The same data batch is pushed through every member's hidden layer ...
    hidden_activations = np.einsum('ij,kjl->kil', batch_data, hidden_kernel) + hidden_bias
    # ... and then through that member's own output layer.
    member_outputs = np.einsum('ijk,ikl->ijl', hidden_activations, output_kernel) + output_bias

    # target_dim is 1, so drop the trailing axis.
    predictions = member_outputs[:, :, 0]

    return predictions, np.hstack((predictions, initial_ensembles))

In [88]:
def ann(hidden = 32, input_shape = 256, output_shape = 1):
    """Build a two-layer Keras model: Input -> Dense(hidden) -> Dense(output_shape, relu).

    The hidden Dense layer is created without an activation argument; the
    prediction layer uses "relu".

    Parameters
    ----------
    hidden : int
        Number of units in the hidden Dense layer.
    input_shape : int or sequence of int
        Feature dimension (a bare int) or a full per-sample shape.
    output_shape : int
        Number of units in the prediction layer.

    Returns
    -------
    tf.keras.models.Model
    """
    # `(input_shape)` is only a parenthesised int, not a tuple. Keras expects a
    # shape tuple and stricter versions reject a bare int, so normalise here.
    if np.ndim(input_shape) == 0:
        shape = (int(input_shape),)
    else:
        shape = tuple(input_shape)

    input_layer = tf.keras.layers.Input(shape = shape)
    hidden_output = tf.keras.layers.Dense(hidden)(input_layer)
    pred_output = tf.keras.layers.Dense(output_shape, activation = "relu")(hidden_output)
    return tf.keras.models.Model(input_layer, pred_output)

In [89]:
def generate_initial_ensembles(num_weights, lambda1, size_ens):
    """Draw `size_ens` samples from a zero-mean Gaussian with covariance lambda1 * I.

    Parameters
    ----------
    num_weights : int
        Dimension of each sample.
    lambda1 : float
        Variance placed on the diagonal of the covariance matrix.
    size_ens : int
        Number of samples to draw.

    Returns
    -------
    Whatever `rvs(size_ens)` of the frozen scipy distribution returns; the
    callers in this notebook reshape the one-dimensional case themselves.
    """
    prior = mvn(np.zeros((num_weights,)), lambda1*np.identity(num_weights))
    return prior.rvs(size_ens)

In [90]:
def expit(x):
    """Logistic sigmoid, 1 / (1 + exp(-x)), evaluated element-wise with numpy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [91]:
# Reference network (32 inputs, 16 hidden units, 1 output). The cells below
# read its weight arrays and parameter count.
samp_ann =  ann(hidden = 16, input_shape = 32, output_shape = 1)

In [92]:
# Weight arrays of the reference network; the forward-pass helpers use the
# sizes of entries 0 and 2 to slice flat parameter vectors.
weights_ann_1 = samp_ann.get_weights()

In [93]:
# Last dimension of layer 1's output, used as the hidden width by the
# forward-pass helpers (presumably layer 1 is the hidden Dense layer - confirm).
h1  = samp_ann.layers[1].output.shape[-1]

In [94]:
# Display the number of parameters in the reference network.
samp_ann.count_params()

545

In [95]:
# Alias for h1; within the visible cells it only appears in commented-out code.
hidden_neurons = h1

In [96]:
# Parameter count of one network; used as the column width of each network's
# slice inside the flat ensemble state.
samp_ann_params = samp_ann.count_params()

In [97]:
def get_initial_X_t(data1, data2, size_ens, var_weights = 1.0, var_weight_weights = 4.0):
    """Sample an initial ensemble and assemble the corresponding state tensor.

    Four independent networks are sampled (two fed with `data1`, two with
    `data2`), followed by three extra 4-wide groups: the "weights" group
    (variance `var_weight_weights`), the "L" group and the "D" group
    (variance `var_weights`). In the "D" group columns 1 and 3 are zeros.

    Parameters
    ----------
    data1, data2 : arrays with the same number of rows
        Inputs for networks 1-2 and 3-4 respectively.
    size_ens : int
        Number of ensemble members.
    var_weights : float
        Prior variance of the network parameters and of the L / D groups.
    var_weight_weights : float
        Prior variance of the "weights" group.

    Returns
    -------
    X_t : array (size_ens, n_samples + n_params + 3, 4)
    initial_ensembles : array (size_ens, 4 * n_params), the four networks side by side
    weights, L, D : arrays (size_ens, 4)
    """
    sampled_networks = []
    stacked_states = []
    # One fresh draw per network, in this order, so the global RNG stream is
    # consumed exactly as network 1, 2, 3, 4.
    for features in (data1, data1, data2, data2):
        draws = generate_initial_ensembles(samp_ann_params, var_weights, size_ens)
        _, stacked = get_targets_with_weights(features, draws, size_ens = size_ens)
        sampled_networks.append(draws)
        stacked_states.append(stacked)

    X_t = np.concatenate([np.expand_dims(stacked, -1) for stacked in stacked_states], axis = -1)

    initial_ensembles_for_weights = np.expand_dims(
        generate_initial_ensembles(4, var_weight_weights, size_ens), 1)
    initial_ensembles_for_L = np.expand_dims(
        generate_initial_ensembles(4, var_weights, size_ens), 1)

    # Two sampled columns interleaved with two all-zero columns: [D1, 0, D2, 0].
    sampled_d1 = generate_initial_ensembles(1, var_weights, size_ens).reshape(-1,1)
    sampled_d2 = generate_initial_ensembles(1, var_weights, size_ens).reshape(-1,1)
    zero_column = np.zeros((size_ens, 1))
    initial_ensembles_for_D = np.expand_dims(
        np.hstack((sampled_d1, zero_column, sampled_d2, zero_column)), 1)

    X_t = np.concatenate(
        (X_t, initial_ensembles_for_weights, initial_ensembles_for_L, initial_ensembles_for_D),
        axis = 1)

    initial_ensembles = np.hstack(sampled_networks)

    return (X_t, initial_ensembles, initial_ensembles_for_weights[:,0,:],
            initial_ensembles_for_L[:,0,:], initial_ensembles_for_D[:,0,:])

In [98]:
def get_weighted_targets_with_weights(batch_data, initial_ensembles, size_ens, weights):
    """Per-member network outputs on `batch_data`, scaled by `weights`.

    Each row of `initial_ensembles` is a flat parameter vector
    [hidden kernel | hidden bias | output kernel | output bias]; group sizes
    are taken from the module-level `weights_ann_1` and `h1`. The two layers
    are applied without any activation and the result is multiplied by
    `weights`.

    Parameters
    ----------
    batch_data : array (n_samples, n_features)
    initial_ensembles : array (size_ens, n_params)
    size_ens : int
    weights : array broadcastable against (size_ens, n_samples)
        The callers in this notebook pass a (size_ens, 1) column.

    Returns
    -------
    weighted_output : array (size_ens, n_samples)
    stack : array (size_ens, n_samples + n_params)
    """
    target_dim = 1

    # Walk through the flat vector, cutting off one parameter group at a time.
    group_widths = (len(weights_ann_1[0].ravel()), h1, len(weights_ann_1[2].ravel()), target_dim)
    groups = []
    cursor = 0
    for width in group_widths:
        groups.append(initial_ensembles[:, cursor:cursor + width])
        cursor = cursor + width
    flat_kernel_1, flat_bias_1, flat_kernel_2, flat_bias_2 = groups

    kernel_1 = flat_kernel_1.reshape(size_ens, batch_data.shape[1], h1)
    bias_1 = flat_bias_1.reshape(size_ens, 1, h1)
    kernel_2 = flat_kernel_2.reshape(size_ens, h1, target_dim)
    bias_2 = flat_bias_2.reshape(size_ens, 1, target_dim)

    layer_1 = np.einsum('ij,kjl->kil', batch_data, kernel_1)
    layer_1 = layer_1 + bias_1

    layer_2 = np.einsum('ijk,ikl->ijl', layer_1, kernel_2)
    layer_2 = layer_2 + bias_2

    # Single target: drop the last axis, then apply the per-member weight.
    weighted_output = layer_2[:, :, 0]*weights

    stack = np.hstack((weighted_output, initial_ensembles))
    return weighted_output, stack

In [99]:
# Scaler used later to map predictions and targets back to their original
# units (only its `inverse_transform` method is used in this notebook).
# The context manager closes the file handle once loading is done.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
with open('..//Data//target_scaler.pkl', 'rb') as f:
    std_targets = pickle.load(f)

In [100]:
# R_t = np.array([[0.02, 0], [0, 0.02]])

In [101]:
# var1 = R_t[0,0]
# var2 = R_t[1,1]
# cov = R_t[1,0]

In [102]:
from scipy.stats import beta

In [103]:
# Frozen Beta(1, 9) distribution. It is bound as the default `fudging_beta`
# of get_predictions and get_updated_ensemble; forward_operation adds draws
# from it to the two model weights before renormalising them.
fudging_beta = beta(1,9)

In [104]:
def forward_operation(data1, data2, combined_ensembles , size_ens, fudging_beta):
    """Training-time forward pass of the two-model mixture for every ensemble member.

    `combined_ensembles` holds, per row, four network parameter blocks of
    width `samp_ann_params` followed by three 4-wide groups ("weights", "L",
    "D"). The "weights" group is passed through a softmax; columns 0-1 give
    model 1's share and columns 2-3 model 2's. A draw from `fudging_beta` is
    added to each share and the pair is renormalised to sum to one.

    Parameters
    ----------
    data1, data2 : arrays with the same number of rows
        Inputs of model 1 (networks 1-2) and model 2 (networks 3-4).
    combined_ensembles : array (size_ens, 4 * samp_ann_params + 12)
    size_ens : int
    fudging_beta : frozen scipy distribution exposing `rvs`

    Returns
    -------
    X_t : array (size_ens, n_samples + samp_ann_params + 3, 4)
    initial_ensembles : array, the consumed columns re-joined side by side
    weighted_alogp : array (size_ens, n_samples), network 1 + network 3
    weighted_psa : array (size_ens, n_samples), network 2 + network 4
    model_1, model_2 : arrays (size_ens, 1), the renormalised shares
    """
    params = samp_ann_params
    tail_start = 4*params

    network_blocks = [combined_ensembles[:, k*params:(k + 1)*params] for k in range(4)]
    initial_ensembles_for_weights = combined_ensembles[:, tail_start:(tail_start + 4)]
    initial_ensembles_for_L = combined_ensembles[:, (tail_start + 4):(tail_start + 8)]
    initial_ensembles_for_D = combined_ensembles[:, (tail_start + 8):(tail_start + 12)]

    softmax_weights = tf.math.softmax(initial_ensembles_for_weights).numpy()

    # The perturbation for model 1 is drawn before the one for model 2.
    share_1 = softmax_weights[:, :2].sum(1).reshape(-1,1) + fudging_beta.rvs(size_ens).reshape(-1,1)
    share_2 = softmax_weights[:, 2:].sum(1).reshape(-1,1) + fudging_beta.rvs(size_ens).reshape(-1,1)

    share_total = share_1 + share_2
    model_1 = share_1/share_total
    model_2 = share_2/share_total

    network_inputs = (data1, data1, data2, data2)
    network_shares = (model_1, model_1, model_2, model_2)
    outputs = []
    stacks = []
    for features, block, share in zip(network_inputs, network_blocks, network_shares):
        out, stacked = get_weighted_targets_with_weights(features, block, size_ens = size_ens,
                                                         weights = share)
        outputs.append(out)
        stacks.append(stacked)

    X_t = np.concatenate([np.expand_dims(stacked, -1) for stacked in stacks], axis = -1)

    tail_groups = [initial_ensembles_for_weights, initial_ensembles_for_L, initial_ensembles_for_D]
    initial_ensembles = np.hstack(network_blocks + tail_groups)

    # Append the three 4-wide groups as extra rows along axis 1.
    X_t = np.concatenate([X_t] + [np.expand_dims(group, 1) for group in tail_groups], axis = 1)

    weighted_alogp = outputs[0] + outputs[2]
    weighted_psa = outputs[1] + outputs[3]

    return X_t, initial_ensembles, weighted_alogp, weighted_psa, model_1, model_2

In [105]:
def forward_operation_test(data1, data2, combined_ensembles , size_ens):
    """Evaluation-time forward pass of the two-model mixture.

    Same column layout and outputs as `forward_operation`, but the model
    shares are the plain softmax sums: no random perturbation is added and no
    renormalisation is applied.

    Parameters
    ----------
    data1, data2 : arrays with the same number of rows
    combined_ensembles : array (size_ens, 4 * samp_ann_params + 12)
    size_ens : int

    Returns
    -------
    X_t, initial_ensembles, weighted_alogp, weighted_psa, model_1, model_2
    """
    params = samp_ann_params
    first_extra = 4*params

    net_1 = combined_ensembles[:, 0:params]
    net_2 = combined_ensembles[:, params:2*params]
    net_3 = combined_ensembles[:, 2*params:3*params]
    net_4 = combined_ensembles[:, 3*params:first_extra]

    mixing_logits = combined_ensembles[:, first_extra:first_extra + 4]
    l_group = combined_ensembles[:, first_extra + 4:first_extra + 8]
    d_group = combined_ensembles[:, first_extra + 8:first_extra + 12]

    softmax_weights = tf.math.softmax(mixing_logits).numpy()

    # Columns 0-1 belong to model 1, columns 2-3 to model 2.
    model_1 = softmax_weights[:, :2].sum(axis = 1, keepdims = True)
    model_2 = softmax_weights[:, 2:].sum(axis = 1, keepdims = True)

    data1_out1, data1_stack1 = get_weighted_targets_with_weights(
        data1, net_1, size_ens = size_ens, weights = model_1)
    data1_out2, data1_stack2 = get_weighted_targets_with_weights(
        data1, net_2, size_ens = size_ens, weights = model_1)
    data2_out1, data2_stack1 = get_weighted_targets_with_weights(
        data2, net_3, size_ens = size_ens, weights = model_2)
    data2_out2, data2_stack2 = get_weighted_targets_with_weights(
        data2, net_4, size_ens = size_ens, weights = model_2)

    per_network = (data1_stack1, data1_stack2, data2_stack1, data2_stack2)
    X_t = np.concatenate([np.expand_dims(stacked, -1) for stacked in per_network], axis = -1)

    initial_ensembles = np.hstack((net_1, net_2, net_3, net_4, mixing_logits, l_group, d_group))

    X_t = np.concatenate(
        (X_t, np.expand_dims(mixing_logits, 1), np.expand_dims(l_group, 1), np.expand_dims(d_group, 1)),
        axis = 1)

    weighted_alogp = data1_out1 + data2_out1
    weighted_psa = data1_out2 + data2_out2

    return X_t, initial_ensembles, weighted_alogp, weighted_psa, model_1, model_2

In [106]:
# samp_ann =  ann(hidden = 16, input_shape = 32, output_shape = 1)

In [107]:
# Length of one ensemble member's flat state: four networks plus the three
# 4-wide groups, i.e. 4 * params + 12 = 4 * (params + 3).
total_weights = 4*(samp_ann.count_params() + 1 + 1 + 1)

In [108]:
# Divisor applied to total_weights to obtain the ensemble size.
reduction = 10

In [109]:
# Number of ensemble members (integer division of the state length).
size_ens = total_weights//reduction

In [110]:
# Display the ensemble size.
size_ens

219

In [111]:
# 4 x 2 selector matrix, one row per network and one column per target:
# networks 1 and 3 feed target column 0, networks 2 and 4 feed target column 1.
G_t = np.array([[1, 0],
                [0, 1],
                [1, 0],
                [0, 1]])

In [112]:
def get_predictions(data1, data2, initial_ensembles, fudging_beta = fudging_beta):
    """Training-time predictions of every ensemble member for both targets.

    Runs `forward_operation` with the module-level `size_ens`.

    Returns
    -------
    preds : array (size_ens, n_samples, 2), [..., 0] is the first target
        (network 1 + network 3), [..., 1] the second (network 2 + network 4)
    w1, w2 : arrays (size_ens, 1), the two model shares
    """
    forward = forward_operation(data1, data2, initial_ensembles,
                                size_ens = size_ens, fudging_beta = fudging_beta)
    weighted_alogp, weighted_psa, w1, w2 = forward[2], forward[3], forward[4], forward[5]
    preds = np.concatenate((np.expand_dims(weighted_alogp, -1),
                            np.expand_dims(weighted_psa, -1)), -1)
    return preds, w1, w2

In [113]:
def get_predictions_test(data1, data2, initial_ensembles):
    """Evaluation-time predictions of every ensemble member for both targets.

    Runs `forward_operation_test` (no random perturbation of the model
    shares) with the module-level `size_ens`.

    Returns
    -------
    preds : array (size_ens, n_samples, 2)
    w1, w2 : arrays (size_ens, 1), the two model shares
    """
    forward = forward_operation_test(data1, data2, initial_ensembles, size_ens = size_ens)
    weighted_alogp, weighted_psa, w1, w2 = forward[2], forward[3], forward[4], forward[5]
    preds = np.concatenate((np.expand_dims(weighted_alogp, -1),
                            np.expand_dims(weighted_psa, -1)), -1)
    return preds, w1, w2

In [114]:
def calculate_mu_bar_G_bar(data1, data2, initial_ensembles, fudging_beta):
    """Ensemble mean of the state and the per-member / mean predicted observations.

    The observation operator is built as kron(G_t.T, H_t): H_t keeps the
    first `n_samples` entries of each network's stacked state (its
    predictions), and G_t.T adds networks 1 + 3 and networks 2 + 4.

    Returns
    -------
    mu_bar : array (n_state, 1), column-wise mean of `initial_ensembles`
    G_bar : array (2 * n_samples, 1), mean of `G_u` over members
    G_u : array (size_ens, 2 * n_samples), predicted observations per member
    """
    n_obs = data1.shape[0]
    # [ I | 0 ]: selects the prediction part of a stacked state column.
    selector = np.hstack((np.identity(n_obs), np.zeros((n_obs, samp_ann_params + 1 + 1 + 1))))
    mu_bar = initial_ensembles.mean(0)

    states = forward_operation(data1, data2, initial_ensembles,
                               size_ens = size_ens, fudging_beta = fudging_beta)[0]
    # Put the network axis before the state axis, then flatten per member.
    states = states.transpose((0,2,1))
    flat_states = states.reshape(states.shape[0], states.shape[1]*states.shape[2])

    observation_operator = np.kron(G_t.T, selector)
    G_u = (observation_operator@flat_states.T).T
    G_bar = G_u.mean(0).ravel()

    return mu_bar.reshape(-1,1), G_bar.reshape(-1,1), G_u

In [115]:
def calculate_C_u(initial_ensembles, mu_bar, G_bar, G_u):
    """Cross-covariance between ensemble states and predicted observations.

    Computes (1 / N) * sum_j (u_j - mu_bar)(G(u_j) - G_bar)^T with a single
    matrix product, where N is the number of rows of `initial_ensembles`.
    This is the sum of per-member outer products, without a Python-level
    loop and without relying on module-level size constants.

    Parameters
    ----------
    initial_ensembles : array (N, n_state)
    mu_bar : array with n_state elements
    G_bar : array with n_obs elements
    G_u : array (N, n_obs)

    Returns
    -------
    C : array (n_state, n_obs)
    G_u_minus_G_bar : array (N, n_obs), the centred predictions
    """
    u_j_minus_u_bar = initial_ensembles - mu_bar.reshape(1,-1)
    G_u_minus_G_bar = G_u - G_bar.reshape(1,-1)
    n_members = initial_ensembles.shape[0]
    # A^T @ B equals the sum over members of outer(A[j], B[j]).
    c = u_j_minus_u_bar.T@G_u_minus_G_bar
    return c/n_members, G_u_minus_G_bar

In [116]:
def calculate_D_u( G_bar, G_u):
    """Sample covariance of the predicted observations.

    Computes (1 / N) * sum_j (G(u_j) - G_bar)(G(u_j) - G_bar)^T with a single
    matrix product, where N is the number of rows of `G_u`. This is the sum
    of per-member outer products, without a Python-level loop and without
    relying on the module-level ensemble size.

    Parameters
    ----------
    G_bar : array with n_obs elements
    G_u : array (N, n_obs)

    Returns
    -------
    D : array (n_obs, n_obs)
    """
    G_u_minus_G_bar = G_u - G_bar.reshape(1,-1)
    # A^T @ A equals the sum over members of outer(A[j], A[j]).
    d = G_u_minus_G_bar.T@G_u_minus_G_bar
    return d/G_u.shape[0]

In [117]:
def get_updated_ensemble(data1, data2, initial_ensembles, y_train, size_ens = size_ens, inflation_factor = 1.0, fudging_beta = fudging_beta):
    """One ensemble-Kalman-style update of every ensemble member.

    With C the state/prediction cross-covariance, D the prediction covariance
    and R_t the observation-noise covariance from `create_cov`, each member is
    moved by C (D + inflation_factor * R_t)^{-1} (y + noise - G(u)), where the
    noise is drawn from N(0, R_t) independently per member.

    Parameters
    ----------
    data1, data2 : arrays with the same number of rows
    initial_ensembles : array (size_ens, n_state)
    y_train : array (n_samples, 2)
        Targets; flattened target-by-target to match the layout of G(u).
    size_ens : int
        Number of noise draws; must equal the number of ensemble members.
    inflation_factor : float
        Multiplier applied to R_t inside the gain only.
    fudging_beta : frozen scipy distribution forwarded to the forward pass

    Returns
    -------
    array (size_ens, n_state), the updated ensemble
    """
    mu_bar, G_bar, G_u = calculate_mu_bar_G_bar(data1, data2, initial_ensembles, fudging_beta)
    C, _ = calculate_C_u(initial_ensembles, mu_bar, G_bar, G_u)
    D = calculate_D_u( G_bar, G_u)
    _, R_t = create_cov(data1.shape[0],initial_ensembles)

    D_plus_cov = D + (R_t *inflation_factor)

    # Perturbed observations: one noise draw per ensemble member.
    noise_mvn = mvn(np.zeros((R_t.shape[0], )), R_t)
    perturbed_targets = y_train.T.flatten().reshape(1,-1) + noise_mvn.rvs(size_ens)
    innovations = perturbed_targets - G_u

    # Solve (D + R) Z = innovations^T instead of forming the explicit inverse:
    # same update, cheaper and numerically better conditioned.
    correction = C@np.linalg.solve(D_plus_cov, innovations.T)

    return initial_ensembles + correction.T

In [118]:
# Number of targets; create_cov uses it as the size of its base matrices.
target_dim = 2

In [119]:
# NOTE(review): not referenced anywhere else in the visible cells - confirm it is still needed.
lambda_D = 1

In [120]:
def inverse_transform(data, idx):
    """Apply `std_targets.inverse_transform` to slice `idx` along the first axis of a 3-D array."""
    return std_targets.inverse_transform(data[idx, :, :])

In [121]:
from joblib import Parallel, delayed

In [122]:
def create_cov(shape, initial_ensembles):
    """Build the observation-noise covariance from the ensemble's "L" and "D" groups.

    The ensemble mean of columns -8:-4 fills a 2 x 2 matrix B (diagonal from
    entries 0 and 1, off-diagonals from entries 2 and 3). The ensemble mean of
    softplus(columns -4:) supplies a diagonal matrix V from entries 0 and 2.
    The 2 x 2 covariance is rebuilt from the Cholesky factor of B B^T + V and
    then expanded to 2 * shape rows and columns, with each scalar entry
    multiplied by an identity block.

    Parameters
    ----------
    shape : int
        Number of observations per target.
    initial_ensembles : array (n_members, n_state)

    Returns
    -------
    cov_mat_final : array (2, 2)
    R_t : array (2 * shape, 2 * shape)
    """
    l_means = initial_ensembles[:, -8:-4].mean(0)
    base_cov = np.identity(target_dim)
    base_cov[0,0], base_cov[1,1] = l_means[0], l_means[1]
    base_cov[0,1], base_cov[1,0] = l_means[2], l_means[3]

    # Softplus is applied per member before averaging over the ensemble.
    d_means = tf.math.softplus(initial_ensembles[:, -4:]).numpy().mean(0)
    base_variances = np.identity(target_dim)
    base_variances[0,0] = d_means[0]
    base_variances[1,1] = d_means[2]

    chol_factor = np.linalg.cholesky(base_cov@base_cov.T + base_variances)
    cov_mat_final = chol_factor@chol_factor.T

    if not is_pos_def(cov_mat_final):
        print("resulting cov matrix is not positive semi definite")

    eye_n = np.identity(shape)
    upper_left = cov_mat_final[0,0]*eye_n
    lower_right = cov_mat_final[1,1]*eye_n
    upper_right = cov_mat_final[1,0]*eye_n

    R_t = np.vstack((np.hstack((upper_left, upper_right)),
                     np.hstack((upper_right.T, lower_right))))

    return cov_mat_final, R_t
    

In [123]:
def is_pos_def(x):
    """Return whether every eigenvalue of the square matrix `x` is strictly greater than zero."""
    eigenvalues = np.linalg.eigvals(x)
    return (eigenvalues > 0).all()

In [124]:
# Load the pre-computed folds. prepare_data indexes `catch[idx]` and reads its
# entries 0-3 as x_train, x_valid, y_train, y_valid.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
with open("..//Data//smiles_to_rdkit_70_30_with_cov_minus_0.27_var.pickle", "rb") as f: 
    catch = pickle.load(f)

In [125]:
# idx = 1

In [126]:
def prepare_data(idx, var_weights = 1.0, var_weight_weights = 4.0):
    """Split fold `idx` into its feature / target parts and sample an initial ensemble.

    Feature matrices are cut at column 32 (first 32 columns = "smiles"
    features, the rest = "rdkit" features). Target matrices are cut at
    column 2: the first two columns are returned as the `*_actual` targets,
    the remaining columns as `y_train` / `y_valid`.

    Returns
    -------
    smiles_feats_train, rdkit_feats_train, smiles_feats_valid,
    rdkit_feats_valid, y_train, y_train_actual, y_valid, y_valid_actual,
    initial_ensembles
        `initial_ensembles` has one row per member: four networks followed by
        the "weights", "L" and "D" groups.
    """
    fold = catch[idx]
    x_train, x_valid, targets_train, targets_valid = fold[0], fold[1], fold[2], fold[3]

    y_train_actual = targets_train[:,:2]
    y_train = targets_train[:,2:]
    y_valid_actual = targets_valid[:,:2]
    y_valid = targets_valid[:,2:]

    smiles_feats_train, rdkit_feats_train = x_train[:, :32], x_train[:, 32:]
    smiles_feats_valid, rdkit_feats_valid = x_valid[:, :32], x_valid[:, 32:]

    sampled = get_initial_X_t(smiles_feats_train, rdkit_feats_train, size_ens = size_ens,
                              var_weights = var_weights, var_weight_weights = var_weight_weights)
    # sampled[0] is the state tensor, which is not needed here; the remaining
    # entries are the network block and the three 4-wide groups.
    initial_ensembles = np.hstack((sampled[1], sampled[2], sampled[3], sampled[4]))

    return (smiles_feats_train, rdkit_feats_train, smiles_feats_valid, rdkit_feats_valid,
            y_train, y_train_actual, y_valid, y_valid_actual, initial_ensembles)

In [127]:
# smiles_feats_train, rdkit_feats_train, smiles_feats_valid, rdkit_feats_valid, y_train, y_train_actual, y_valid, y_valid_actual, initial_ensembles  = prepare_data(idx)

In [128]:
# y_train

In [129]:
# y_train_actual

In [130]:
from scipy.linalg import norm

In [131]:
# beta(1,19).rvs(size_ens).mean()

In [132]:
def _interval_metrics(ensemble_preds, y_true):
    """Width, coverage and RMSE of the 2.5-97.5 percentile band of an ensemble of predictions.

    `ensemble_preds` is (n_members, n_samples, n_targets) and `y_true` is
    (n_samples, n_targets); each returned array has one entry per target.
    """
    lower, upper = np.percentile(ensemble_preds, axis = 0, q = (2.5, 97.5))
    avg_width = (upper - lower).mean(0)
    coverage = ((y_true >= lower) & (y_true <= upper)).mean(0)
    rmse = np.sqrt(((y_true - ensemble_preds.mean(0))**2).mean(0))
    return avg_width, coverage, rmse


def get_results(idx, var_weights = 1.0, var_weight_weights = 1.0, inflation_factor = 1.2, fudging_beta = beta(1,19), coverage_target = 0.95, max_iter = 10000):
    """Run the iterative ensemble update on fold `idx` and report the best iteration.

    The ensemble is updated repeatedly. After every update the training and
    validation metrics are computed on the inverse-transformed scale. The
    iteration with the narrowest mean training interval among those whose
    mean training coverage exceeds `coverage_target` is remembered. Iteration
    stops as soon as the mean training coverage drops below
    `coverage_target`, or after `max_iter` updates.

    Parameters
    ----------
    idx : int
        Fold index into the module-level `catch`.
    var_weights, var_weight_weights : float
        Prior variances forwarded to `prepare_data`.
    inflation_factor : float
        Forwarded to `get_updated_ensemble`.
    fudging_beta : frozen scipy distribution
        Forwarded to the update and to the training-time predictions.
    coverage_target : float
        Mean training coverage that a recorded iteration must exceed.
    max_iter : int
        Upper bound on the number of updates.

    Returns
    -------
    list or None
        [train_width, train_coverage, train_rmse, test_width, test_coverage,
        test_rmse, smiles_weight_mean, smiles_weight_lower, smiles_weight_upper]
        for the best recorded iteration, or None when no iteration ever met
        the coverage target.
    """
    (smiles_feats_train, rdkit_feats_train, smiles_feats_valid, rdkit_feats_valid,
     y_train, y_train_actual, y_valid, y_valid_actual, initial_ensembles) = prepare_data(
        idx, var_weights = var_weights, var_weight_weights = var_weight_weights)

    # The targets on the original scale do not change between iterations.
    y_train_cur = std_targets.inverse_transform(y_train_actual)
    y_valid_cur = std_targets.inverse_transform(y_valid_actual)

    best_train_width_mean = 100000
    best = None

    for _ in range(0, max_iter):
        initial_ensembles = get_updated_ensemble(smiles_feats_train, rdkit_feats_train, initial_ensembles, y_train, size_ens, inflation_factor = inflation_factor, fudging_beta = fudging_beta)

        G_u_train, w1, w2 = get_predictions(smiles_feats_train, rdkit_feats_train, initial_ensembles, fudging_beta)
        # Map every member's predictions back to the original scale. A plain
        # loop is used because this function already runs inside a joblib
        # worker and the per-member work is tiny.
        G_u_train = np.array([inverse_transform(G_u_train, member) for member in range(G_u_train.shape[0])])
        avg_width_train, coverage_train, rmse_train = _interval_metrics(G_u_train, y_train_cur)

        G_u_test, _, _ = get_predictions_test(smiles_feats_valid, rdkit_feats_valid, initial_ensembles)
        G_u_test = np.array([inverse_transform(G_u_test, member) for member in range(G_u_test.shape[0])])
        avg_width_test, coverage_test, rmse_test = _interval_metrics(G_u_test, y_valid_cur)

        # 2.5 / 97.5 percentiles of the first model's share across the ensemble.
        li_smiles_weight, ui_smiles_weight = np.percentile(w1, axis = 0, q = (2.5, 97.5))[:, 0]

        mean_coverage_train = coverage_train.mean()

        if (avg_width_train.mean() < best_train_width_mean) and (mean_coverage_train > coverage_target):
            best_train_width_mean = avg_width_train.mean()
            best = [avg_width_train.tolist(), coverage_train.tolist(), rmse_train.tolist(),
                    avg_width_test.tolist(), coverage_test.tolist(), rmse_test.tolist(),
                    w1.mean(), li_smiles_weight, ui_smiles_weight]

        if mean_coverage_train < coverage_target:
            break

    if best is None:
        # Coverage never exceeded the target, so there is nothing to report.
        print("no iteration met the coverage target for fold" + str(idx), flush = True)
        return None

    print("done for fold" + str(idx), flush = True)
    print("train_width" + str(best[0]), flush = True)
    print("test_width" + str(best[3]), flush = True)
    print("smiles_weight" + str(best[6]), flush = True)
    print("rmse_train" + str(best[2]), flush = True)
    print("rmse_test" + str(best[5]), flush = True)
    print("smiles_weight_ci" + str([best[7], best[8]]), flush = True)

    return best


In [82]:
# results_df[results_df["indicator"] == False]

In [133]:
%%time
# get_results(45, var_weights = 1.0, var_weight_weights = 2.0, inflation_factor =1.0, fudging_beta = beta(1,11))

CPU times: user 4 µs, sys: 3 µs, total: 7 µs
Wall time: 12.4 µs


In [134]:
# from joblib import Parallel, delayed

In [135]:
# Run get_results for folds 0-49 in up to 15 parallel joblib workers;
# catch_all collects one return value per fold, in fold order.
catch_all = Parallel(n_jobs = 15, verbose = 10)(delayed(get_results)(idx,var_weights = 1.0, var_weight_weights = 2.0, inflation_factor =1.0, fudging_beta = beta(1,11)) for idx in range(0,50))

[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.


done for fold2
train_width[1.342344545339677, 29.446403530445718]
test_width[1.14623533222744, 26.52427261438076]
smiles_weight0.6946662916688016
rmse_train[0.2580343338435064, 8.230185829805214]
rmse_test[0.2007145748718262, 7.2726991094572675]
smiles_weight_ci[0.5896863547917883, 0.7771886028037417]
done for fold13
train_width[1.2695542583529316, 28.858615419372047]
test_width[1.121820970398027, 25.930315801058576]
smiles_weight0.7401562459133153
rmse_train[0.25721261678979224, 7.691257075726726]
rmse_test[0.2732691918441032, 7.352549596487283]
smiles_weight_ci[0.6292809738107403, 0.8290177595599642]


[Parallel(n_jobs=15)]: Done   2 tasks      | elapsed:  4.6min


done for fold3
train_width[1.0504821938360112, 23.145921633737903]
test_width[0.844149569015977, 18.378450474991233]
smiles_weight0.7008752811533934
rmse_train[0.23109236033716227, 5.5433687305171935]
rmse_test[0.19535947441404855, 5.470823871625542]
smiles_weight_ci[0.592449040305265, 0.7995745213422307]
done for fold4
train_width[1.1650510334234012, 21.011107023526623]
test_width[0.8330887280652592, 17.37391221949856]
smiles_weight0.7584343727881501
rmse_train[0.2578623395921502, 5.098406479784484]
rmse_test[0.2228029151099859, 5.468639752587699]
smiles_weight_ci[0.657278435542009, 0.8433854681011782]
done for fold0
train_width[1.3009392252717864, 24.561710159288882]
test_width[0.9455052359163918, 19.536938550115504]
smiles_weight0.8048454391265286
rmse_train[0.21379407811513923, 6.347951875089702]
rmse_test[0.24673393923919618, 5.2972972003973915]
smiles_weight_ci[0.6855218405927496, 0.8805369807803635]
done for fold1
train_width[1.0856283447737274, 21.31344171576643]
test_width[0.8

[Parallel(n_jobs=15)]: Done  11 tasks      | elapsed:  5.4min


done for fold7
train_width[0.9749942095889056, 20.188351423497622]
test_width[0.6817383503027712, 15.993317016778663]
smiles_weight0.7647106582749054
rmse_train[0.20797435134652886, 4.262737015384224]
rmse_test[0.20204581446536887, 3.6787973383106642]
smiles_weight_ci[0.6544855800989602, 0.8501359512741514]
done for fold8
train_width[0.9811649113197165, 20.357484008674753]
test_width[0.6762970399705198, 15.99527052566849]
smiles_weight0.7502554248193873
rmse_train[0.19819127120598215, 4.841482282905202]
rmse_test[0.25643475195607257, 3.8886635379830556]
smiles_weight_ci[0.6372807770090502, 0.8374547380158175]
done for fold5
train_width[1.0249306304898913, 17.93345710020641]
test_width[0.6683439655474837, 14.982152799472725]
smiles_weight0.7536005302223517
rmse_train[0.2259932468974626, 4.318130433094611]
rmse_test[0.2734856836742981, 4.633793315951064]
smiles_weight_ci[0.639823281152588, 0.8345096096757998]
done for fold11
train_width[0.9438846741993865, 16.261745210379985]
test_width[

[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:  9.3min


done for fold23
train_width[1.0969114566783562, 26.58541562888578]
test_width[0.9935427709992608, 24.22659567770113]
smiles_weight0.7054934382668328
rmse_train[0.2160121003167035, 5.4292960100125285]
rmse_test[0.36058810807210173, 15.717683882971631]
smiles_weight_ci[0.6101802825782684, 0.7936799443888948]
done for fold18
train_width[0.935137014886854, 18.01035850793979]
test_width[0.5586456921079909, 13.604549645821201]
smiles_weight0.8282063585379403
rmse_train[0.18040239130341207, 4.678052166274092]
rmse_test[0.2850381456604789, 5.310595552205211]
smiles_weight_ci[0.7164300642638577, 0.8981619395351573]
done for fold27
train_width[1.3637567888528437, 22.699519459459648]
test_width[0.9901918835651347, 18.822622879677745]
smiles_weight0.7610801083675369
rmse_train[0.2705834864880231, 5.512161132651812]
rmse_test[0.24247487881043633, 4.484260746353197]
smiles_weight_ci[0.6534649927681497, 0.8425537016299365]
done for fold26
train_width[1.2900594660343419, 23.251592576224247]
test_width

[Parallel(n_jobs=15)]: Done  27 out of  50 | elapsed: 10.4min remaining:  8.8min


done for fold28
train_width[1.0441387690825263, 21.12297239656916]
test_width[0.7889747911655601, 16.66827207507826]
smiles_weight0.7381498250249339
rmse_train[0.22738084879200798, 4.868646763939097]
rmse_test[0.23728917596035812, 4.623696717647783]
smiles_weight_ci[0.6256578676703913, 0.8141353296046844]
done for fold25
train_width[1.0169451246596626, 18.329058758063464]
test_width[0.6309792872210356, 14.008310840104322]
smiles_weight0.8007331806749001
rmse_train[0.22726495305178857, 4.441501909954528]
rmse_test[0.23968097647288306, 3.4955410836964877]
smiles_weight_ci[0.6794435205957478, 0.8725261603943306]
done for fold20
train_width[0.7652679565246745, 15.177796716731049]
test_width[0.504050338282113, 11.035096161315092]
smiles_weight0.7500864800270122
rmse_train[0.1896545012859573, 3.9527107169418416]
rmse_test[0.19490720165829623, 3.480232267260877]
smiles_weight_ci[0.6507707936158784, 0.8213436587785828]
done for fold29
train_width[0.961083202150988, 19.047637774650944]
test_wid

[Parallel(n_jobs=15)]: Done  33 out of  50 | elapsed: 13.5min remaining:  7.0min


done for fold33
train_width[1.2913845758084128, 23.36604780650913]
test_width[0.8666965182556723, 17.374587629891025]
smiles_weight0.8135325848114539
rmse_train[0.23922337424269086, 5.64231387710761]
rmse_test[0.23877203152646018, 5.197952437918724]
smiles_weight_ci[0.6867213479082427, 0.892877930065138]
done for fold34
train_width[1.089222491702365, 21.599005246859527]
test_width[0.8746048231937877, 18.52165323341435]
smiles_weight0.7436097988478021
rmse_train[0.22890710899820566, 4.72929876924728]
rmse_test[0.24071635293553859, 4.413066834874326]
smiles_weight_ci[0.6069636649221681, 0.8251225402962746]
done for fold41
train_width[1.622117683046341, 31.96053446193032]
test_width[1.2980178403192193, 26.526418187339065]
smiles_weight0.7680452939957555
rmse_train[0.35350224684181836, 7.90699767744089]
rmse_test[0.33293398707375604, 6.481423454450227]
smiles_weight_ci[0.6368648473889751, 0.8511739587710891]
done for fold31
train_width[0.9637655288366379, 19.139220678102518]
test_width[0.8

[Parallel(n_jobs=15)]: Done  39 out of  50 | elapsed: 14.6min remaining:  4.1min


done for fold36
train_width[0.9674797590968744, 17.542857220070886]
test_width[0.6781859554147603, 14.892593320451024]
smiles_weight0.7798365046160911
rmse_train[0.19679677256922826, 4.208404570662663]
rmse_test[0.36782070279830725, 5.3684928495994315]
smiles_weight_ci[0.675671798226206, 0.861417387298091]
done for fold37
train_width[0.8434938027415355, 16.517200913016843]
test_width[0.5901355130312427, 14.298565194530374]
smiles_weight0.768124069687399
rmse_train[0.17797303810745257, 3.7383011443964085]
rmse_test[0.2214868584413996, 3.6825718538145127]
smiles_weight_ci[0.6590372482203521, 0.843930618711745]
done for fold42
train_width[1.128156109341176, 20.205942403890734]
test_width[0.782438074165966, 15.567797453542905]
smiles_weight0.7663456046358178
rmse_train[0.21446015389716114, 5.253212770531536]
rmse_test[0.23047129918097392, 4.447073057995823]
smiles_weight_ci[0.6293893606025084, 0.8440999769199092]
done for fold38
train_width[1.0365278412796615, 16.231254660297786]
test_widt

[Parallel(n_jobs=15)]: Done  45 out of  50 | elapsed: 15.5min remaining:  1.7min


done for fold46
train_width[1.0517779694415559, 19.97656473766907]
test_width[0.7718698525483283, 17.025558820842768]
smiles_weight0.7497137281128665
rmse_train[0.20855255945574921, 4.776260312860472]
rmse_test[0.17285157688143113, 3.8279938820086006]
smiles_weight_ci[0.6060602775730572, 0.8371752036207404]
done for fold47
train_width[1.2665098350996296, 23.06101673490192]
test_width[0.9211068864642971, 19.07214197347718]
smiles_weight0.7648178927081588
rmse_train[0.2170355891782304, 5.583943552050668]
rmse_test[0.441329842948549, 23.475718609809732]
smiles_weight_ci[0.6409832875951731, 0.847354446133436]
done for fold49
train_width[1.0436640583701828, 22.048658867018954]
test_width[0.8737650524015557, 19.3769923282389]
smiles_weight0.667033457469279
rmse_train[0.28053562037042296, 5.92496420005376]
rmse_test[0.2541233192524328, 5.089699162996322]
smiles_weight_ci[0.5648400154823114, 0.7702158964837837]
done for fold48
train_width[1.092268411513275, 23.355135899425946]
test_width[0.991

[Parallel(n_jobs=15)]: Done  50 out of  50 | elapsed: 17.5min finished


In [136]:
# item

In [137]:
# catch_all[-1]

In [138]:
# catch_inner

In [139]:
# Flatten every per-run result record in `catch_all` into one flat row.
# Each record mixes list-valued entries (spliced in element by element) with
# scalar entries (kept as a single value), so every row ends up as a flat
# sequence suitable for one DataFrame row.
all_catch = []
for item in catch_all:
    catch_inner = []
    for inner in item:
        if isinstance(inner, list):
            # List-valued entry: contribute one column per element.
            catch_inner.extend(inner)
        else:
            # Scalar entry: contribute exactly one column.
            catch_inner.append(inner)
    all_catch.append(catch_inner)

In [140]:
# Build the results table: one row per flattened record in `all_catch`,
# columns are positional (0..k-1) until they are renamed further down.
results_df = pd.DataFrame(all_catch)

In [141]:
# Sanity check of (rows, columns); the column count must equal len(col_names)
# for the later `results_df.columns = col_names` assignment to succeed.
results_df.shape

(50, 15)

In [142]:
# results_df

In [143]:
# results_df.iloc[:,-1].mean()

In [144]:
# Column labels for the flattened per-run metrics.
# The first 12 names follow the pattern <target>_<split>_<metric>, ordered by
# split (Train, Test), then metric (Width, Coverage, RMSE), then target
# (Alop, PSA); the final three describe the smiles weight and its interval.
col_names = [
    f"{target}_{split}_{metric}"
    for split in ("Train", "Test")
    for metric in ("Width", "Coverage", "RMSE")
    for target in ("Alop", "PSA")
] + [
    "Smiles_Avg_Weight",
    "Lower_Interval_Smiles_Weight",
    "Upper_Interval_Smiles_Weight",
]

In [145]:
# results_df.head()

In [146]:
# Replace the positional column labels with the descriptive names; pandas
# raises a ValueError if len(col_names) differs from the number of columns.
results_df.columns = col_names

In [147]:
# Flag rows whose smiles-weight interval contains 0.70.
# NOTE(review): 0.70 is presumably the reference smiles weight for this run,
# and the lower bound is strict while the upper bound is inclusive — confirm
# both are intended.
results_df["indicator"] = (
    results_df["Lower_Interval_Smiles_Weight"].lt(0.70)
    & results_df["Upper_Interval_Smiles_Weight"].ge(0.70)
)

In [148]:
# Fraction of rows whose interval contains the reference value
# (mean of a boolean column = share of True entries).
results_df["indicator"].mean()

0.96

In [149]:
# Width of the smiles-weight interval: upper bound minus lower bound, per row.
results_df["width_weight_CI"] = (
    results_df["Upper_Interval_Smiles_Weight"]
    - results_df["Lower_Interval_Smiles_Weight"]
)

In [150]:
# Column-wise average over all rows, displayed as a two-column table
# (original column name under "index", its mean under 0).
results_df.mean().reset_index()

Unnamed: 0,index,0
0,Alop_Train_Width,1.111646
1,PSA_Train_Width,21.66327
2,Alop_Train_Coverage,0.964951
3,PSA_Train_Coverage,0.952378
4,Alop_Train_RMSE,0.230081
5,PSA_Train_RMSE,5.203101
6,Alop_Test_Width,0.825925
7,PSA_Test_Width,17.975331
8,Alop_Test_Coverage,0.914667
9,PSA_Test_Coverage,0.93675


In [151]:
# Persist the per-run metrics (including the derived indicator and interval
# width columns); index=False keeps the integer row index out of the file.
# NOTE(review): the relative path depends on the notebook's working
# directory — confirm ../Data exists where this is run.
results_df.to_csv("..//Data//smiles_rdkit_70_30__with_cov_minus_0.27_Simulation_added_beat_noise.csv", index = False)

In [152]:
# Show the rows whose smiles-weight interval does NOT contain the reference value.
results_df.loc[~results_df["indicator"]]

Unnamed: 0,Alop_Train_Width,PSA_Train_Width,Alop_Train_Coverage,PSA_Train_Coverage,Alop_Train_RMSE,PSA_Train_RMSE,Alop_Test_Width,PSA_Test_Width,Alop_Test_Coverage,PSA_Test_Coverage,Alop_Test_RMSE,PSA_Test_RMSE,Smiles_Avg_Weight,Lower_Interval_Smiles_Weight,Upper_Interval_Smiles_Weight,indicator,width_weight_CI
18,0.935137,18.010359,0.965229,0.938804,0.180402,4.678052,0.558646,13.60455,0.7125,0.954167,0.285038,5.310596,0.828206,0.71643,0.898162,False,0.181732
26,1.290059,23.251593,0.954103,0.952712,0.253759,5.577591,0.892806,19.112149,0.829167,0.954167,0.32932,8.64724,0.833305,0.732208,0.911026,False,0.178817


In [153]:
# results_df.mean().reset_index()

In [154]:
# results_df.std()