In [32]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from scipy.stats import multivariate_normal as mvn

In [33]:
# Load the saved Keras model from its directory on disk.
base_model_path = "..//Base_Models//Model_BOTH"
model_alogp = tf.keras.models.load_model(base_model_path)

In [34]:
# Load the small-molecule phase 3 feature table from CSV.
phase_3_features_csv = "..//Data/smiles_with_rdkit_with_small_phase_3_features.csv"
small_mol_phase_3_features = pd.read_csv(phase_3_features_csv)

In [35]:
# The model takes two inputs: the "Smiles" column, and every column from position 1 onward.
smiles_input = small_mol_phase_3_features["Smiles"].values
descriptor_input = small_mol_phase_3_features.iloc[:, 1:].values
small_mol_phase_3_preds = model_alogp.predict(
    [smiles_input, descriptor_input],
    batch_size=1000,
)



In [36]:
# Load the saved bottleneck-feature array from disk.
bottleneck_features_path = "..//Data//small_mol_phase_3_features_for_both.npy"
bottleneck_features = np.load(bottleneck_features_path)

In [37]:
# Display the array's shape; the cells below split its columns at index 32.
bottleneck_features.shape

(959, 64)

In [38]:
# Split the bottleneck columns at index 32: the first 32 columns are treated as the
# SMILES part, the remaining columns as the RDKit part.
smiles_width = 32
smiles_features = bottleneck_features[:, :smiles_width]
rdkit_features = bottleneck_features[:, smiles_width:]

In [39]:
# get_weights() returns the model's full weight list as NumPy arrays, so fetch it once
# rather than once per slice. The second-to-last array is split by rows at index 32,
# matching the column split applied to the bottleneck features
# (presumably this array is the final layer's kernel -- confirm against the model).
final_weight_matrix = model_alogp.get_weights()[-2]
smiles_weights = final_weight_matrix[:32, :]
rdkit_weights = final_weight_matrix[32:, :]

In [40]:
# Take the model's last weight array and reshape it to a single row so it can be
# broadcast-added to a (samples, outputs) array later on.
last_weight_array = model_alogp.get_weights()[-1]
final_bias = last_weight_array.reshape(1, -1)

In [41]:
# Contribution of the SMILES half of the bottleneck features (bias not included).
smiles_output = np.matmul(smiles_features, smiles_weights)

In [42]:
# Contribution of the RDKit half of the bottleneck features (bias not included).
rdkit_output = np.matmul(rdkit_features, rdkit_weights)

In [43]:
# pd.read_csv("..//Data//y_train.csv")

In [44]:
# Load the training targets.
y_train_csv = "..//Data//y_train.csv"
y_train = pd.read_csv(y_train_csv)
# Alternative target source (disabled):
# y_train = pd.read_csv("..//Data/smiles_with_rdkit_with_small_phase_3_outputs.csv").iloc[:,1:]

In [45]:
# Load the pickled target scaler. The context manager closes the file handle, which the
# bare open(...) call left open.
# NOTE(security): pickle.load can execute arbitrary code while unpickling; only load
# scaler files that come from a trusted source.
with open('..//Data//target_scaler.pkl', 'rb') as scaler_file:
    std_targets = pickle.load(scaler_file)

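Security note: unpickling can execute arbitrary code, so only load pickled scalers/models from a trusted source. See https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations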


In [46]:
# std_targets = StandardScaler()

In [47]:
# Apply the loaded scaler to the targets.
# NOTE: this rebinds y_train to the transformed values, so the cell is not idempotent --
# re-running it without first re-running the cell that loads y_train applies the
# transform a second time to already-transformed data.
y_train = std_targets.transform(y_train)

In [48]:
# Covariance between the target columns (rows are observations, columns are variables).
np.cov(y_train, rowvar=False)

array([[ 1.00000059, -0.30301738],
       [-0.30301738,  1.00000059]])

In [49]:
# Correlation between the target columns (rows are observations, columns are variables).
np.corrcoef(y_train, rowvar=False)

array([[ 1.       , -0.3030172],
       [-0.3030172,  1.       ]])

In [50]:
# np.cov(std_targets.inverse_transform(y_train).T)

In [51]:
# weighted_outs = weighted_outs + final_bias

In [52]:
# np.save("..//Data//smiles_0.7_rdkit_0.3_signal.npy", weighted_outs)

In [53]:
# Off-diagonal entry of the noise covariance matrix built in the next cell.
# With equal variances of 0.2 on the diagonal, this corresponds to a correlation of -0.9.
noise_variance = 0.2
noise_correlation = -0.9
covariance = noise_correlation * noise_variance

In [54]:
# 2x2 noise covariance matrix: 0.2 on the diagonal, `covariance` off the diagonal.
diagonal_variance = 0.2
R_t = np.array(
    [
        [diagonal_variance, covariance],
        [covariance, diagonal_variance],
    ]
)

In [55]:
# Display the shape of the noise covariance matrix.
R_t.shape

(2, 2)

In [56]:
# Display the noise covariance matrix that is passed to generate_data_copies below.
R_t

array([[ 0.2 , -0.18],
       [-0.18,  0.2 ]])

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
def generate_data_copies(bottleneck_features, smiles_output, rdkit_output, weight_smiles, R_t=None, reps=50, random_state=None):
    """Generate `reps` noisy train/validation splits of a weighted SMILES/RDKit signal.

    The clean signal is
    ``weight_smiles * smiles_output + (1 - weight_smiles) * rdkit_output + final_bias``,
    where ``final_bias`` is read from module scope. For every repetition a fresh
    zero-mean multivariate-normal noise sample with covariance ``R_t`` is added, the
    clean and noisy signals are stacked side by side (clean columns first), and the
    result is split 75/25 together with ``bottleneck_features``.

    Parameters
    ----------
    bottleneck_features : array-like
        Inputs to split; must have one row per row of the signal.
    smiles_output, rdkit_output : ndarray
        2-D arrays of identical shape (rows are samples, columns are targets).
    weight_smiles : float
        Weight given to ``smiles_output``; ``rdkit_output`` gets ``1 - weight_smiles``.
    R_t : array-like or None, optional
        Noise covariance. When None, ``np.cov(y_train.T)`` is computed from the
        module-level ``y_train`` at call time (previously this default was frozen
        when the function was defined).
    reps : int, optional
        Number of independent noisy copies to generate.
    random_state : None, int or numpy.random.RandomState, optional
        None keeps the original behaviour (NumPy's global RNG). An int or a
        RandomState makes both the noise draws and the splits reproducible.

    Returns
    -------
    list
        ``reps`` entries of ``[x_train, x_valid, y_train, y_valid]``, where each ``y``
        array holds the clean signal columns followed by the noisy signal columns.
    """
    # Resolve the default lazily so it always reflects the current y_train and does not
    # require y_train to exist when the function is defined.
    if R_t is None:
        R_t = np.cov(y_train.T)

    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    weight_rdkit = 1 - weight_smiles
    clean_signal = weight_smiles * smiles_output + weight_rdkit * rdkit_output
    clean_signal = clean_signal + final_bias

    n_samples, n_targets = clean_signal.shape[0], clean_signal.shape[1]
    # The noise distribution does not change between repetitions, so build it once.
    noise_dist = mvn(np.zeros((n_targets,)), R_t)

    catch = []
    for _ in range(reps):
        # rvs drops singleton dimensions (one sample or one target); restore the 2-D
        # shape so the addition below is element-wise rather than a broadcast.
        noise = np.asarray(noise_dist.rvs(n_samples, random_state=rng)).reshape(clean_signal.shape)
        noisy_signal = clean_signal + noise
        combo_y = np.hstack((clean_signal, noisy_signal))
        # Local names deliberately avoid `y_train`, which would shadow the module-level
        # array used for the default covariance above.
        x_fit, x_val, y_fit, y_val = train_test_split(
            bottleneck_features, combo_y, test_size=0.25, shuffle=True, random_state=rng
        )
        catch.append([x_fit, x_val, y_fit, y_val])
    return catch

In [59]:
# Seed NumPy's global RNG so the noise draws and the train/validation splits -- and
# therefore the pickle written below -- are reproducible on Restart & Run All.
# Both scipy's rvs and sklearn's train_test_split fall back to this global RNG when no
# explicit random_state is supplied.
DATA_SEED = 0
np.random.seed(DATA_SEED)

# 80% SMILES / 20% RDKit signal, 50 noisy copies with noise covariance R_t.
catch = generate_data_copies(
    bottleneck_features,
    smiles_output,
    rdkit_output,
    weight_smiles=0.8,
    R_t=R_t,
    reps=50,
)

In [60]:
# catch[0][3]

In [61]:
# Persist the generated list of train/validation splits to disk.
splits_pickle_path = "..//Data//smiles_to_rdkit_80_20_with_cov_minus_0.2_var.pickle"
with open(splits_pickle_path, "wb") as out_file:
    pickle.dump(catch, out_file)

In [31]:
# np.save("..//Data//smiles_0.8_rdkit_0.2_signal_plus_noise.npy", weighted_outs_with_error)