In [2]:
# Path to Neural Fingerprint scripts

import sys

# Directory added to the import path; presumably where the helper modules
# imported further down (build_vanilla_net, build_convnet, util, ...) live.
NEURAL_FP_UTILS_DIR = '../../scripts/baselines/neuralfingerprints/utils'

# Guarded so that re-running this cell does not pile up duplicate entries.
if NEURAL_FP_UTILS_DIR not in sys.path:
    sys.path.append(NEURAL_FP_UTILS_DIR)



In [3]:
from sklearn.metrics import r2_score

In [4]:
import os, pickle
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import DrawingOptions
import matplotlib.pyplot as plt

from build_vanilla_net import build_morgan_deep_net, relu, build_standard_net
from build_convnet import build_conv_deep_net, build_convnet_fingerprint_fun
from util import normalize_array, build_batched_grad
from optimizers import adam
from util import rmse
from mol_graph import degrees
from data_util import remove_duplicates


In [6]:
# new function for loading our datasets
def load_data(dataset_path = '../../data/3_final_data/split_data', prefix_name='logP_pH_range_mean', VALUE_COLUMN = 'logP', SMILES_COLUMN='smiles', filter_invalid=False):
    """Load the train / test / validation CSV splits of one dataset.

    For every split the file ``<dataset_path>/<prefix_name>_<split>.csv`` is
    read with pandas.

    Parameters
    ----------
    dataset_path : str
        Directory containing the split CSV files.
    prefix_name : str
        File-name prefix shared by the three split files; also used to name
        the broken-SMILES log file.
    VALUE_COLUMN : str
        Name of the CSV column holding the regression target.
    SMILES_COLUMN : str
        Name of the CSV column holding the SMILES strings.
    filter_invalid : bool, optional
        When True, rows whose SMILES cannot be parsed by RDKit, or that
        contain an atom whose degree is outside 0..5, are dropped and their
        SMILES appended to the broken-SMILES log. Defaults to False, which
        returns every row unfiltered (the previous behaviour).

    Returns
    -------
    dict
        Maps 'train', 'test' and 'validation' to a tuple
        ``(smiles_array, target_array)`` taken from the two columns.

    Side effects
    ------------
    ``../../data/raw/broken_smiles_<prefix_name>.txt`` is truncated on every
    call, even when ``filter_invalid`` is False.
    """
    import os
    import pandas as pd

    # NOTE(review): presumably mirrors the degrees supported by the
    # neural-fingerprint featurizer (mol_graph.degrees) -- confirm.
    allowed_degrees = (0, 1, 2, 3, 4, 5)
    broken_smiles_path = '../../data/raw/broken_smiles_'+prefix_name+'.txt'

    def check_molecules(smiles):
        """Return True if the SMILES parses and every atom has an allowed degree."""
        # rdkit is only needed for the optional filter, so import it lazily.
        from rdkit.Chem import MolFromSmiles
        mol = MolFromSmiles(smiles)
        # MolFromSmiles returns None for unparseable input; treat that as
        # broken instead of crashing on mol.GetAtoms().
        is_valid = mol is not None and all(
            atom.GetDegree() in allowed_degrees for atom in mol.GetAtoms())
        if not is_valid:
            with open(broken_smiles_path, 'a') as f:
                f.write(smiles+'\n')
        return is_valid

    # Start each call with an empty log so appended entries belong to this run.
    with open(broken_smiles_path, 'w') as f:
        pass

    datasets = {}
    for split in ('train', 'test', 'validation'):
        data = pd.read_csv(os.path.join(dataset_path, prefix_name+'_'+split+'.csv'))
        if filter_invalid:
            data = data[data[SMILES_COLUMN].map(check_molecules)]
        datasets[split] = (data[SMILES_COLUMN].values, data[VALUE_COLUMN].values)

    return datasets

In [31]:
def parse_training_params(params):
    """Build the two settings dicts used by the training code.

    Reads 'init_scale', 'fp_length' and 'l2_penalty' from ``params`` and the
    notebook-level globals ``num_epochs``, ``batch_size``, ``normalize`` and
    ``activation``.

    Returns a tuple ``(nn_train_params, vanilla_net_params)``.
    """
    # Optimisation settings.
    nn_train_params = {}
    nn_train_params['num_epochs'] = num_epochs
    nn_train_params['batch_size'] = batch_size
    nn_train_params['param_scale'] = params['init_scale']

    # Settings for the net that sits on top of the fingerprint.
    vanilla_net_params = {}
    vanilla_net_params['layer_sizes'] = [params['fp_length']]  # Linear regression.
    vanilla_net_params['normalize'] = normalize
    vanilla_net_params['L2_reg'] = params['l2_penalty']
    vanilla_net_params['activation_function'] = activation

    return nn_train_params, vanilla_net_params

def train_nn(pred_fun, loss_fun, num_weights, train_smiles, train_raw_targets, train_params,
             validation_smiles=None, validation_raw_targets=None):
    """loss_fun has inputs (weights, smiles, targets)"""
    print "Total number of weights in the network:", num_weights
    npr.seed(0)
    init_weights = npr.randn(num_weights) * train_params['param_scale']

    train_targets, undo_norm = normalize_array(train_raw_targets)
    training_curve = []
    def callback(weights, iter):
        if iter % 10 == 0:
            print "max of weights", np.max(np.abs(weights))
            train_preds = undo_norm(pred_fun(weights, train_smiles))
            cur_loss = loss_fun(weights, train_smiles, train_targets)
            training_curve.append(cur_loss)
            print "Iteration", iter, "loss", cur_loss, "train RMSE", \
                np.sqrt(np.mean((train_preds - train_raw_targets)**2)),
            print "Train R2", iter, ":", \
                    r2_score(train_raw_targets, train_preds),
            if validation_smiles is not None:
                validation_preds = undo_norm(pred_fun(weights, validation_smiles))
                print "Validation RMSE", iter, ":", \
                    np.sqrt(np.mean((validation_preds - validation_raw_targets) ** 2)),
                print "Validation R2", iter, ":", \
                    r2_score(validation_raw_targets, validation_preds),
            dub_preds = undo_norm(pred_fun(weights, dub_smiles))
            uniq_preds = undo_norm(pred_fun(weights, uniq_smiles))
            print "Dub RMSE", iter, ":", rmse(dub_preds, dub_targets)
            print "Unique RMSE", iter, ":", rmse(uniq_preds,  uniq_targets)
            print "Dub R2", iter, ":", r2_score(dub_targets, dub_preds)
            print "Unique R2", iter, ":", r2_score(uniq_targets, uniq_preds)

    grad_fun = grad(loss_fun)
    grad_fun_with_data = build_batched_grad(grad_fun, train_params['batch_size'],
                                            train_smiles, train_targets)

    num_iters = train_params['num_epochs'] * len(train_smiles) / train_params['batch_size']
    trained_weights = adam(grad_fun_with_data, init_weights, callback=callback,
                           num_iters=num_iters)
    
    def predict_func(new_smiles):
        """Returns to the original units that the raw targets were in."""
        return undo_norm(pred_fun(trained_weights, new_smiles))
    return predict_func, trained_weights, training_curve


def train_neural_fingerprint():
    print "Loading data..."
    data = load_data(prefix_name = task_params['data_file'], VALUE_COLUMN = task_params['target_name'])

    train_inputs, train_targets = data['train']
    val_inputs,   val_targets   = data['validation']
    test_inputs,  test_targets  = data['test']

    print "Regression on", len(train_inputs), "training points."
    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        test_preds = pred_func(test_inputs)
        dub_preds = pred_func(dub_smiles)
        uniq_preds = pred_func(uniq_smiles)
        print "\nPerformance (RMSE) on " + task_params['target_name'] + ":"
        print "Train:", rmse(train_preds, train_targets)
        print "Test: ", rmse(test_preds,  test_targets)
        print "Dub:", rmse(dub_preds, dub_targets)
        print "Unique: ", rmse(uniq_preds,  uniq_targets)
        
        print "\nPerformance (R2) on " + task_params['target_name'] + ":"
        print "Train:", r2_score(train_targets, train_preds)
        print "Test: ", r2_score(test_targets, test_preds)
        print "Dub:", r2_score(dub_targets, dub_preds)
        print "Unique: ", r2_score(uniq_targets, uniq_preds)
        print "-" * 80
        return rmse(test_preds,  test_targets)

    print "-" * 80
    print "Mean predictor"
    y_train_mean = np.mean(train_targets)
    print_performance(lambda x : y_train_mean*np.ones(len(x)))

    print "Task params", params
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False

    print "Convnet fingerprints with neural net"
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
         train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 nn_train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
    print_performance(predict_func)
    return trained_weights


def draw_molecule_with_highlights(filename, smiles, highlight_atoms):
    """Draw one molecule with selected atoms highlighted and save it to a file.

    Parameters
    ----------
    filename : str
        Output path passed to ``fig.savefig``.
    smiles : str
        SMILES string of the molecule to draw.
    highlight_atoms : list
        Atom indices passed to RDKit as ``highlightAtoms``.

    Uses the notebook globals ``highlight_color`` and ``figsize``.
    """
    drawoptions = DrawingOptions()
    drawoptions.selectColor = highlight_color
    drawoptions.elemDict = {}   # Don't color nodes based on their element.
    drawoptions.bgColor=None

    mol = Chem.MolFromSmiles(smiles)
    fig = Draw.MolToMPL(mol, highlightAtoms=highlight_atoms, size=figsize, options=drawoptions,fitImage=False)

    try:
        fig.gca().set_axis_off()
        fig.savefig(filename, bbox_inches='tight')
    finally:
        # Always release the figure, even if saving fails; plot() draws many
        # figures in a loop and open ones would otherwise accumulate.
        plt.close(fig)


def construct_atom_neighbor_list(array_rep):
    """Flatten the per-degree neighbour tables of ``array_rep`` into one list.

    Walks ``degrees`` in order and, for each degree, appends every entry of
    ``array_rep[('atom_neighbors', degree)]`` converted to a plain list.
    """
    neighbors_per_atom = []
    for degree in degrees:
        for neighbours in array_rep[('atom_neighbors', degree)]:
            neighbors_per_atom.append(list(neighbours))
    return neighbors_per_atom


def plot(trained_weights, FIGURE_PATH = '../../data/raw'):
    
    print "Loading data..."
    data = load_data(prefix_name = task_params['data_file'], VALUE_COLUMN = task_params['target_name'])

    train_smiles, train_targets = data['train']
    val_inputs,   val_targets   = data['validation']
    test_inputs,  test_targets  = data['test']

    print "Convnet fingerprints with neural net"
    conv_arch_params['return_atom_activations'] = True
    output_layer_fun, parser, compute_atom_activations = \
       build_convnet_fingerprint_fun(**conv_arch_params)
    atom_activations, array_rep = compute_atom_activations(trained_weights, train_smiles)

    if not os.path.exists(os.path.join(FIGURE_PATH, 'figures')): os.makedirs(os.path.join(FIGURE_PATH, 'figures'))

    parent_molecule_dict = {}
    for mol_ix, atom_ixs in enumerate(array_rep['atom_list']):
        for atom_ix in atom_ixs:
            parent_molecule_dict[atom_ix] = mol_ix

    atom_neighbor_list = construct_atom_neighbor_list(array_rep)

    def get_neighborhood_ixs(array_rep, cur_atom_ix, radius):
        # Recursive function to get indices of all atoms in a certain radius.
        if radius == 0:
            return set([cur_atom_ix])
        else:
            cur_set = set([cur_atom_ix])
            for n_ix in atom_neighbor_list[cur_atom_ix]:
                cur_set.update(get_neighborhood_ixs(array_rep, n_ix, radius-1))
            return cur_set

    # Recreate trained network.
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False
    _, _, combined_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])

    net_loss_fun, net_pred_fun, net_parser = build_standard_net(**vanilla_net_params)
    net_weights = combined_parser.get(trained_weights, 'net weights')
    last_layer_weights = net_parser.get(net_weights, ('weights', 0))

    for fp_ix in range(params['fp_length']):
        print "FP {0} has linear regression coefficient {1}".format(fp_ix, last_layer_weights[fp_ix][0])
        combined_list = []
        for radius in all_radii:
            fp_activations = atom_activations[radius][:, fp_ix]
            combined_list += [(fp_activation, atom_ix, radius) for atom_ix, fp_activation in enumerate(fp_activations)]

        unique_list = remove_duplicates(combined_list, key_lambda=lambda x: x[0])
        combined_list = sorted(unique_list, key=lambda x: -x[0])

        for fig_ix in range(num_figs_per_fp):
            # Find the most-activating atoms for this fingerprint index, across all molecules and depths.
            activation, most_active_atom_ix, cur_radius = combined_list[fig_ix]
            most_activating_mol_ix = parent_molecule_dict[most_active_atom_ix]
            highlight_list_our_ixs = get_neighborhood_ixs(array_rep, most_active_atom_ix, cur_radius)
            highlight_list_rdkit = [array_rep['rdkit_ix'][our_ix] for our_ix in highlight_list_our_ixs]

            print "radius:", cur_radius, "atom list:", highlight_list_rdkit, "activation", activation
            draw_molecule_with_highlights(\
                os.path.join(FIGURE_PATH, 'figures',"fp_{0}_highlight_{1}.pdf".format(fp_ix, fig_ix)),
                train_smiles[most_activating_mol_ix],
                highlight_atoms=highlight_list_rdkit)

In [12]:
def get_averaged_and_unique_smiles(dataset_name,
                                   datasets_path = "../../data/3_final_data",
                                   not_averaged_dataset_name = 'logP.csv',
                                   SMILES_COLUMN = 'smiles',
                                   VALUE_COLUMN = 'logP'):
    """Split a dataset into SMILES with duplicated and with unique measurements.

    The non-averaged file is used to count, per SMILES, how many non-null
    values of ``VALUE_COLUMN`` exist. Rows of ``dataset_name`` are then split
    by whether their SMILES has more than one measurement or exactly one.

    Parameters
    ----------
    dataset_name : str
        CSV file (inside ``datasets_path``) whose rows are split.
    datasets_path : str
        Directory containing both CSV files.
    not_averaged_dataset_name : str
        CSV file holding the individual, non-averaged measurements.
    SMILES_COLUMN, VALUE_COLUMN : str
        Column names of the SMILES strings and of the measured value.

    Returns
    -------
    (dub_smiles, dub_targets, uniq_smiles, uniq_targets) : tuple of lists
    """
    # pandas is not imported at notebook scope, so import it here; relying on
    # a leftover ``pd`` in the kernel raises NameError on a fresh run.
    import pandas as pd

    logP_dataset = pd.read_csv(os.path.join(datasets_path, not_averaged_dataset_name))
    dataset = pd.read_csv(os.path.join(datasets_path, dataset_name))

    # Number of measurements recorded for every SMILES.
    duplicates_smiles = logP_dataset.groupby([SMILES_COLUMN]).count()

    smiles_dub = list(duplicates_smiles[duplicates_smiles[VALUE_COLUMN]>1].index)
    smiles_uniq = list(duplicates_smiles[duplicates_smiles[VALUE_COLUMN]==1].index)

    dub_data = dataset[dataset[SMILES_COLUMN].isin(smiles_dub)]
    uniq_data = dataset[dataset[SMILES_COLUMN].isin(smiles_uniq)]

    dub_smiles, dub_targets = list(dub_data[SMILES_COLUMN]), list(dub_data[VALUE_COLUMN])
    uniq_smiles, uniq_targets = list(uniq_data[SMILES_COLUMN]), list(uniq_data[VALUE_COLUMN])

    return dub_smiles, dub_targets, uniq_smiles, uniq_targets

## logP_mean

### Visualization

In [27]:
# Dataset selection: target column and CSV file prefix passed to load_data().
task_params = {
    'target_name': 'logP',
    'data_file': 'logp_mean',
}

# Training settings read as globals by parse_training_params().
num_epochs = 1
batch_size = 100
normalize = 1
dropout = 0          # not read by any code shown in this notebook
activation = relu

# Hyper-parameters. The code shown in this notebook reads 'fp_length',
# 'fp_depth', 'init_scale', 'l2_penalty' and 'conv_width'; the remaining keys
# are only echoed in the "Task params" printout.
params = {
    'fp_length': 20,
    'fp_depth': 3,
    'init_scale': np.exp(-4),
    'learn_rate': np.exp(-4),
    'b1': np.exp(-4),
    'b2': np.exp(-4),
    'l2_penalty': np.exp(-4),
    'l1_penalty': np.exp(-5),
    'conv_width': 10,
}

# One hidden layer of width 'conv_width' per fingerprint depth level.
conv_layer_sizes = [params['conv_width']] * params['fp_depth']
conv_arch_params = {
    'num_hidden_features': conv_layer_sizes,
    'fp_length': params['fp_length'],
    'normalize': normalize,
    'return_atom_activations': False,
}

# Radii 0 .. fp_depth inclusive, iterated by plot().
all_radii = range(params['fp_depth'] + 1)

# Plotting parameters
num_figs_per_fp = 11
figsize = (100, 100)
highlight_color = tuple(channel / 255.0 for channel in (30.0, 100.0, 255.0))  # A nice light blue.

In [32]:
trained_network_weights = train_neural_fingerprint()
# Pickle data is a byte stream, so the file is opened in binary mode for both
# writing and reading.
with open('results.pkl', 'wb') as f:
    pickle.dump(trained_network_weights, f)

# Plotting.
with open('results.pkl', 'rb') as f:
    trained_weights = pickle.load(f)
plot(trained_weights)

Loading data...
Regression on 9631 training points.
--------------------------------------------------------------------------------
Mean predictor

Performance (RMSE) on logP:
Train: 1.8591219922763682
Test:  1.9087278157747403
Dub: 2.178198827192713
Unique:  1.8030779775623207

Performance (R2) on logP:
Train: 0.0
Test:  -1.8278865013598988e-05
Dub: -0.004600498132678821
Unique:  -0.0018443796611973262
--------------------------------------------------------------------------------
Task params {'learn_rate': 0.01831563888873418, 'fp_depth': 3, 'b1': 0.01831563888873418, 'b2': 0.01831563888873418, 'init_scale': 0.01831563888873418, 'fp_length': 20, 'l2_penalty': 0.01831563888873418, 'l1_penalty': 0.006737946999085467, 'conv_width': 10}
Convnet fingerprints with neural net
Total number of weights in the network: 9891
max of weights 0.06962983567500523
Iteration 0 loss 1.0015408196365463 train RMSE 1.8605440963555582 Train R2 0 : -0.001530451462992577 Validation RMSE 0 : 1.8694340847349

radius: 3 atom list: [7, 1, 3, 6, 0, 2, 4, 5, 8] activation 0.09927388497075428
radius: 3 atom list: [4, 7, 10, 11, 14, 9, 15, 8, 12, 13, 16] activation 0.09926514529688635
radius: 3 atom list: [1, 2, 4, 6, 8, 10, 3, 5, 7, 9, 11] activation 0.09923963651257624
radius: 3 atom list: [0, 2, 32, 33, 1, 3, 5, 4, 31] activation 0.09913396014719324
FP 4 has linear regression coefficient -0.0370703061898
radius: 3 atom list: [0, 3, 5, 4, 6, 8, 7, 2, 1, 10, 9] activation 0.09925203115321078
radius: 3 atom list: [5, 7, 9, 1, 4, 6, 8, 2, 10, 0, 3] activation 0.09895188280117369
radius: 3 atom list: [34, 30, 33, 35, 41, 43, 45, 42, 44, 46, 47, 32] activation 0.09753845017580777
radius: 3 atom list: [2, 0, 3, 5, 6, 1, 4] activation 0.09695303982831739
radius: 3 atom list: [7, 10, 13, 14, 9, 12, 11, 16, 15] activation 0.09687574117479231
radius: 3 atom list: [1, 2, 4, 6, 3, 5, 16, 17, 18] activation 0.09662608032070982
radius: 3 atom list: [4, 7, 10, 11, 14, 9, 15, 8, 12, 13, 16] activation 0.096339

radius: 3 atom list: [4, 7, 10, 11, 14, 9, 15, 8, 12, 13, 16] activation 0.09708742891611805
radius: 3 atom list: [1, 2, 4, 6, 8, 10, 0, 3, 5, 7, 9] activation 0.09707213774448331
radius: 3 atom list: [7, 1, 3, 6, 0, 2, 4, 5, 8] activation 0.09703193435640178
radius: 3 atom list: [0, 2, 32, 33, 1, 3, 5, 4, 31] activation 0.09692153302924106
radius: 3 atom list: [1, 2, 4, 6, 8, 10, 3, 5, 7, 9, 11] activation 0.09691820045912501
FP 13 has linear regression coefficient -0.019693963584
radius: 3 atom list: [0, 3, 5, 4, 6, 8, 7, 2, 1, 10, 9] activation 0.09686448990023876
radius: 3 atom list: [5, 7, 9, 1, 4, 6, 8, 2, 10, 0, 3] activation 0.09661971678910115
radius: 3 atom list: [34, 30, 33, 35, 41, 43, 45, 42, 44, 46, 47, 32] activation 0.09546331675683599
radius: 3 atom list: [2, 0, 3, 5, 6, 1, 4] activation 0.09494148814455165
radius: 3 atom list: [7, 10, 13, 14, 9, 12, 11, 16, 15] activation 0.09489225966481231
radius: 3 atom list: [1, 2, 4, 6, 3, 5, 16, 17, 18] activation 0.094680271918

In [33]:
# Dataset selection: target column and CSV file prefix passed to load_data().
task_params = {
    'target_name': 'logP',
    'data_file': 'logp_mean',
}

# Training settings read as globals by parse_training_params().
num_epochs = 5
batch_size = 100
normalize = 1
dropout = 0          # not read by any code shown in this notebook
activation = relu

# Hyper-parameters. The code shown in this notebook reads 'fp_length',
# 'fp_depth', 'init_scale', 'l2_penalty' and 'conv_width'; the remaining keys
# are only echoed in the "Task params" printout.
params = {
    'fp_length': 50,
    'fp_depth': 4,
    'init_scale': np.exp(-4),
    'learn_rate': np.exp(-4),
    'b1': np.exp(-4),
    'b2': np.exp(-4),
    'l2_penalty': np.exp(-4),
    'l1_penalty': np.exp(-5),
    'conv_width': 20,
}

# One hidden layer of width 'conv_width' per fingerprint depth level.
conv_layer_sizes = [params['conv_width']] * params['fp_depth']
conv_arch_params = {
    'num_hidden_features': conv_layer_sizes,
    'fp_length': params['fp_length'],
    'normalize': normalize,
    'return_atom_activations': False,
}

# Radii 0 .. fp_depth inclusive, iterated by plot().
all_radii = range(params['fp_depth'] + 1)

# Plotting parameters
num_figs_per_fp = 11
figsize = (100, 100)
highlight_color = tuple(channel / 255.0 for channel in (30.0, 100.0, 255.0))  # A nice light blue.

In [34]:
trained_network_weights = train_neural_fingerprint()
# Pickle data is a byte stream, so the file is opened in binary mode for both
# writing and reading.
with open('results.pkl', 'wb') as f:
    pickle.dump(trained_network_weights, f)

# Plotting.
with open('results.pkl', 'rb') as f:
    trained_weights = pickle.load(f)
plot(trained_weights)

Loading data...
Regression on 9631 training points.
--------------------------------------------------------------------------------
Mean predictor

Performance (RMSE) on logP:
Train: 1.8591219922763682
Test:  1.9087278157747403
Dub: 2.178198827192713
Unique:  1.8030779775623207

Performance (R2) on logP:
Train: 0.0
Test:  -1.8278865013598988e-05
Dub: -0.004600498132678821
Unique:  -0.0018443796611973262
--------------------------------------------------------------------------------
Task params {'learn_rate': 0.01831563888873418, 'fp_depth': 4, 'b1': 0.01831563888873418, 'b2': 0.01831563888873418, 'init_scale': 0.01831563888873418, 'fp_length': 50, 'l2_penalty': 0.01831563888873418, 'l1_penalty': 0.006737946999085467, 'conv_width': 20}
Convnet fingerprints with neural net
Total number of weights in the network: 30571
max of weights 0.08535001578936458
Iteration 0 loss 1.0049397981323287 train RMSE 1.86369714187141 Train R2 0 : -0.004927895158615581 Validation RMSE 0 : 1.85822046516658

max of weights 0.2799464150019621
Iteration 210 loss 0.2915834895508801 train RMSE 1.003618207154074 Train R2 210 : 0.7085787033094347 Validation RMSE 210 : 0.9824098394676521 Validation R2 210 : 0.7194327085175308 Dub RMSE 210 : 1.0318291807552724
Unique RMSE 210 : 0.9712784240065011
Dub R2 210 : 0.774568832063946
Unique R2 210 : 0.7092908301688576
max of weights 0.2870569164738435
Iteration 220 loss 0.29272653611032734 train RMSE 1.0055798135722693 Train R2 220 : 0.7074384040591195 Validation RMSE 220 : 0.9869747301019479 Validation R2 220 : 0.7168192684778716 Dub RMSE 220 : 1.0411395772038996
Unique RMSE 220 : 0.9781808088422429
Dub R2 220 : 0.7704822588837228
Unique R2 220 : 0.705144302429876
max of weights 0.2931865823105432
Iteration 230 loss 0.28310939381582834 train RMSE 0.9889094064375857 Train R2 230 : 0.7170581172358654 Validation RMSE 230 : 0.9640463592153821 Validation R2 230 : 0.7298235629315631 Dub RMSE 230 : 1.0338277883490294
Unique RMSE 230 : 0.945969083386725
Dub R2 

radius: 2 atom list: [3, 5, 13, 6, 9, 11, 7, 8] activation 0.040953470742095516
radius: 2 atom list: [12, 14, 15, 16, 17, 11, 13, 18] activation 0.040509755723145253
radius: 2 atom list: [3, 6, 2, 5, 7, 4, 11, 12] activation 0.04048103278517118
radius: 2 atom list: [8, 10, 4, 7, 3, 6, 2, 5] activation 0.040292644406531755
radius: 2 atom list: [6, 8, 9, 10, 7, 11, 4, 5] activation 0.040282857321568784
radius: 2 atom list: [14, 15, 1, 8, 10, 12, 11, 13] activation 0.039802530138448984
radius: 2 atom list: [9, 10, 11, 7, 8, 1, 4, 6] activation 0.03976206299011197
radius: 2 atom list: [10, 1, 4, 6, 11, 8, 7, 9] activation 0.03968459272300808
radius: 2 atom list: [13, 11, 12, 14, 17, 15, 16] activation 0.03957228976631147
radius: 2 atom list: [4, 6, 7, 1, 3, 5, 8] activation 0.03954493304756484
radius: 2 atom list: [5, 8, 4, 6, 7, 1, 3] activation 0.039447733270060834
FP 8 has linear regression coefficient 0.0498027384253
radius: 2 atom list: [0, 2, 4, 6, 1, 3, 5, 7] activation 0.0465048726

radius: 1 atom list: [7, 8, 9, 10, 11] activation 0.03222850036669252
radius: 1 atom list: [13, 14, 15, 9, 12] activation 0.03217018601750053
radius: 1 atom list: [10, 8, 11, 7, 9] activation 0.03202898468512692
radius: 1 atom list: [9, 10, 11, 8, 12] activation 0.03196617010961869
radius: 1 atom list: [4, 6, 5, 3, 10] activation 0.03135502464548764
radius: 1 atom list: [4, 5, 6, 7, 8] activation 0.03127855301432458
radius: 1 atom list: [0, 2, 3, 4, 1] activation 0.031216595966351984
radius: 1 atom list: [3, 1, 4, 0, 2] activation 0.031215489352381717
FP 17 has linear regression coefficient -0.0530127645214
radius: 2 atom list: [1, 3, 4, 0, 2] activation 0.047969597203528044
radius: 2 atom list: [3, 4, 0, 2, 1] activation 0.047921494822552896
radius: 2 atom list: [13, 16, 14, 15, 12] activation 0.04781644796695214
radius: 2 atom list: [1, 0, 2, 3, 4] activation 0.04770133703583243
radius: 2 atom list: [2, 1, 3, 4, 0] activation 0.047591382821993655
radius: 2 atom list: [16, 17, 15, 12,

radius: 2 atom list: [6, 1, 4, 5] activation 0.04657352641617795
radius: 2 atom list: [3, 5, 13, 6, 9, 11, 7, 8] activation 0.04625786511587027
radius: 2 atom list: [5, 4, 6, 7, 8] activation 0.04621138057974475
FP 26 has linear regression coefficient -0.037356098093
radius: 2 atom list: [12, 11, 15, 13, 14, 16] activation 0.033965294977200036
radius: 2 atom list: [3, 1, 5, 0, 2, 4] activation 0.0338462970365178
radius: 2 atom list: [3, 5, 6, 8, 4, 7] activation 0.03354415100366281
radius: 2 atom list: [3, 2, 1, 4, 0] activation 0.03339256965607653
radius: 2 atom list: [4, 11, 0, 3, 2, 1] activation 0.03329323990164905
radius: 2 atom list: [2, 0, 1, 4, 3] activation 0.03328789510960107
radius: 2 atom list: [4, 1, 3, 0, 2] activation 0.033187444201085825
radius: 2 atom list: [1, 3, 5, 4, 6] activation 0.03317284756937457
radius: 2 atom list: [2, 4, 6, 5, 7] activation 0.033013481554705786
radius: 2 atom list: [2, 0, 3, 1, 4] activation 0.03283772637372656
radius: 1 atom list: [0, 1] act

radius: 2 atom list: [3, 5, 13, 6, 9, 11, 7, 8] activation 0.046077564206234996
radius: 2 atom list: [0, 2, 4, 6, 1, 3, 5, 7] activation 0.04587008924137514
radius: 2 atom list: [4, 8, 18, 14, 5, 6, 7, 15, 16, 17] activation 0.04563372593799655
radius: 2 atom list: [2, 4, 6, 7, 8, 5, 16, 18, 20, 19] activation 0.045533968949391906
radius: 2 atom list: [3, 6, 2, 5, 7, 4, 11, 12] activation 0.045478889857132665
radius: 2 atom list: [12, 14, 15, 16, 17, 11, 13, 18] activation 0.045475338721004864
radius: 2 atom list: [7, 6, 9, 16, 18, 8, 19, 1, 3, 5] activation 0.045088280003211134
radius: 2 atom list: [8, 10, 4, 7, 3, 6, 2, 5] activation 0.04508012210448425
radius: 2 atom list: [10, 12, 13, 11] activation 0.04502261216196935
radius: 2 atom list: [1, 4, 6, 7, 8, 14, 16, 3, 5, 17] activation 0.04498752916578476
radius: 2 atom list: [2, 6, 14, 3, 4, 5, 7, 11, 13, 15] activation 0.04498351312551987
FP 36 has linear regression coefficient 0.0867025466438
radius: 4 atom list: [12, 13, 14, 15, 

radius: 4 atom list: [20, 23, 26, 29, 31, 25, 30, 27, 28, 21, 19] activation 0.05786793733128262
radius: 4 atom list: [18, 19, 20, 22, 15, 16, 21, 23, 13, 17] activation 0.057861008671463585
radius: 4 atom list: [18, 20, 15, 13, 14, 19, 21, 22, 16, 17] activation 0.05785832194323812
radius: 4 atom list: [5, 2, 1, 4, 0, 3] activation 0.057845346165469984
radius: 4 atom list: [15, 12, 17, 18, 11, 16, 14, 13] activation 0.05775992025407088
FP 45 has linear regression coefficient 0.0504047010561
radius: 2 atom list: [0, 2, 4, 6, 1, 3, 5, 7] activation 0.04914246708946112
radius: 2 atom list: [4, 8, 18, 14, 5, 6, 7, 15, 16, 17] activation 0.048774904791333416
radius: 2 atom list: [2, 4, 6, 7, 8, 5, 16, 18, 20, 19] activation 0.04847990166910043
radius: 2 atom list: [7, 6, 9, 16, 18, 8, 19, 1, 3, 5] activation 0.04788460009867911
radius: 2 atom list: [1, 4, 6, 7, 8, 14, 16, 3, 5, 17] activation 0.04775450158015263
radius: 2 atom list: [2, 6, 14, 3, 4, 5, 7, 11, 13, 15] activation 0.047749106

### Compare errors for SMILES with and without duplicate measurements

In [21]:
# Dataset selection: target column and CSV file prefix passed to load_data().
task_params = {
    'target_name': 'logP',
    'data_file': 'logp_mean',
}

# Training settings read as globals by parse_training_params().
num_epochs = 1
batch_size = 100
normalize = 1
dropout = 0          # not read by any code shown in this notebook
activation = relu

# Hyper-parameters for the fingerprint network.
params = {
    'fp_length': 20,
    'fp_depth': 3,
    'init_scale': np.exp(-4),
    'l2_penalty': np.exp(-2),
    'conv_width': 10,
}

# One hidden layer of width 'conv_width' per fingerprint depth level.
conv_layer_sizes = [params['conv_width']] * params['fp_depth']
conv_arch_params = {
    'num_hidden_features': conv_layer_sizes,
    'fp_length': params['fp_length'],
    'normalize': normalize,
    'return_atom_activations': False,
}

# Radii 0 .. fp_depth inclusive, iterated by plot().
all_radii = range(params['fp_depth'] + 1)

# Plotting parameters
num_figs_per_fp = 11
figsize = (100, 100)
highlight_color = tuple(channel / 255.0 for channel in (30.0, 100.0, 255.0))  # A nice light blue.

In [22]:
# Build the duplicate / unique evaluation sets (read as notebook globals by the
# training code) from the averaged CSV of the current task, then train.
(dub_smiles, dub_targets,
 uniq_smiles, uniq_targets) = get_averaged_and_unique_smiles(task_params['data_file']+'.csv')
trained_network_weights = train_neural_fingerprint()

Loading data...
Regression on 9631 training points.
--------------------------------------------------------------------------------
Mean predictor

Performance (RMSE) on logP:
Train: 1.8591219922763682
Test:  1.9087278157747403
Dub: 2.196473320235497
Unique:  1.8589662274903223

Performance (R2) on logP:
Train: 0.0
Test:  -1.8278865013598988e-05
Dub: -0.004552022728161242
Unique:  -1.558775978693916e-05
--------------------------------------------------------------------------------
Task params {'fp_length': 20, 'l2_penalty': 0.1353352832366127, 'fp_depth': 3, 'conv_width': 10, 'init_scale': 0.01831563888873418}
Convnet fingerprints with neural net
Total number of weights in the network: 9891
max of weights 0.06962983567500523
Iteration 0 loss 1.0016070624790232 train RMSE 1.8605440963555582 Train R2 0 : -0.001530451462992577 Validation RMSE 0 : 1.8694340847349085 Validation R2 0 : -0.015949090875271965 Dub RMSE 0 : 2.1786428990626363
Unique RMSE 0 : 1.8633197123142058
Dub R2 0 : 0.01

## logP_wo_parameters

### Visualization

In [35]:
# Dataset selection: target column and CSV file prefix passed to load_data().
task_params = {
    'target_name': 'logP',
    'data_file': 'logP_wo_parameters',
}

# Training settings read as globals by parse_training_params().
num_epochs = 5
batch_size = 100
normalize = 1
dropout = 0          # not read by any code shown in this notebook
activation = relu

# Hyper-parameters for the fingerprint network.
params = {
    'fp_length': 50,
    'fp_depth': 4,
    'init_scale': np.exp(-4),
    'l2_penalty': np.exp(-2),
    'conv_width': 20,
}

# One hidden layer of width 'conv_width' per fingerprint depth level.
conv_layer_sizes = [params['conv_width']] * params['fp_depth']
conv_arch_params = {
    'num_hidden_features': conv_layer_sizes,
    'fp_length': params['fp_length'],
    'normalize': normalize,
    'return_atom_activations': False,
}

# Radii 0 .. fp_depth inclusive, iterated by plot().
all_radii = range(params['fp_depth'] + 1)

# Plotting parameters
num_figs_per_fp = 11
figsize = (100, 100)
highlight_color = tuple(channel / 255.0 for channel in (30.0, 100.0, 255.0))  # A nice light blue.

In [36]:
trained_network_weights = train_neural_fingerprint()
# Pickle data is a byte stream, so the file is opened in binary mode for both
# writing and reading.
with open('results.pkl', 'wb') as f:
    pickle.dump(trained_network_weights, f)

# Plotting.
with open('results.pkl', 'rb') as f:
    trained_weights = pickle.load(f)
plot(trained_weights)

Loading data...
Regression on 8838 training points.
--------------------------------------------------------------------------------
Mean predictor

Performance (RMSE) on logP:
Train: 1.8146042407352028
Test:  1.8141849992522239
Dub: 2.1832152392353295
Unique:  1.8014880653973313

Performance (R2) on logP:
Train: 0.0
Test:  -0.0002990500750039704
Dub: -0.009233034859763123
Unique:  -7.835254551791238e-05
--------------------------------------------------------------------------------
Task params {'fp_length': 50, 'l2_penalty': 0.1353352832366127, 'fp_depth': 4, 'conv_width': 20, 'init_scale': 0.01831563888873418}
Convnet fingerprints with neural net
Total number of weights in the network: 30571
max of weights 0.08535001578936458
Iteration 0 loss 1.0041956952405227 train RMSE 1.8183273856100233 Train R2 0 : -0.004107743500010308 Validation RMSE 0 : 1.789294284026002 Validation R2 0 : -0.0034116315193561952 Dub RMSE 0 : 2.1911715822316413
Unique RMSE 0 : 1.8048158473406302
Dub R2 0 : -0.

Iteration 440 loss 0.1918334863355846 train RMSE 0.7916566735776115 Train R2 440 : 0.8096688325990715 Validation RMSE 440 : 0.7999606324212283 Validation R2 440 : 0.7994360787163144 Dub RMSE 440 : 0.9136218989966348
Unique RMSE 440 : 0.7981911872944559
Dub R2 440 : 0.8232614241840815
Unique R2 440 : 0.8036707059025162

Performance (RMSE) on logP:
Train: 0.7882765129488928
Test:  0.8399708726205302
Dub: 0.9039245630138911
Unique:  0.7956045013820937

Performance (R2) on logP:
Train: 0.8112906883214202
Test:  0.7855652499189894
Dub: 0.8269933784575323
Unique:  0.8049411266965805
--------------------------------------------------------------------------------
Loading data...
Convnet fingerprints with neural net
FP 0 has linear regression coefficient -0.0344252496511
radius: 1 atom list: [13, 15, 16, 17, 18, 14] activation 0.0313388872803712
radius: 1 atom list: [8, 9] activation 0.03093855947174184
radius: 1 atom list: [3, 0, 2, 1] activation 0.030921512976782688
radius: 1 atom list: [14,

radius: 2 atom list: [4, 16, 2, 17, 3, 5, 6, 7, 13, 15] activation 0.04259826826381979
radius: 2 atom list: [8, 5, 12, 14, 2, 4, 6, 7, 13, 15] activation 0.042330277966307533
radius: 2 atom list: [16, 10, 5, 7, 8, 9, 13, 15, 11, 17] activation 0.0423042897806678
radius: 2 atom list: [17, 2, 18, 3, 5, 6, 7, 14, 16, 4] activation 0.04221466834453979
radius: 2 atom list: [8, 9, 10, 11, 7] activation 0.04207948179004244
radius: 2 atom list: [4, 2, 0, 1, 3] activation 0.041899955565856406
radius: 2 atom list: [7, 8, 9, 10, 11] activation 0.04184039830674033
FP 9 has linear regression coefficient -0.0588043182796
radius: 4 atom list: [6, 7, 0, 2, 3, 1, 5, 4, 8, 9] activation 0.0538680650648846
radius: 4 atom list: [3, 6, 7, 8, 1, 4, 0, 2, 5] activation 0.053819608776944125
radius: 4 atom list: [1, 2, 3, 0, 4] activation 0.05381705158116561
radius: 4 atom list: [3, 1, 4, 7, 15, 8, 0, 2, 5, 6] activation 0.05381700335095583
radius: 4 atom list: [62, 60, 2, 3, 4, 59, 1, 0, 61] activation 0.0538

FP 18 has linear regression coefficient -0.0566370070571
radius: 4 atom list: [14, 5, 8, 17, 15, 16, 18, 19, 6, 7, 9] activation 0.047519148031711214
radius: 4 atom list: [10, 11, 12, 5, 6, 9, 13, 14, 15, 7, 8] activation 0.047518435424660506
radius: 4 atom list: [4, 0, 1, 2, 3, 5, 6, 14, 15] activation 0.047516670841201446
radius: 4 atom list: [0, 1, 2, 3, 4, 5] activation 0.047514240916352035
radius: 4 atom list: [0, 1, 2, 3, 6, 13, 14, 4, 5] activation 0.04751148743392476
radius: 4 atom list: [50, 44, 45, 46, 47, 48, 49] activation 0.04751073588883061
radius: 4 atom list: [7, 2, 6, 11, 1, 3, 8, 9, 10] activation 0.047510507734301356
radius: 4 atom list: [2, 0, 15, 3, 12, 13, 14, 1] activation 0.047510214528214696
radius: 4 atom list: [0, 14, 1, 3, 2, 11, 12, 13] activation 0.0475093822661178
radius: 4 atom list: [4, 5, 6, 0, 3, 1, 2, 8, 7] activation 0.04750905451278071
radius: 4 atom list: [2, 3, 4, 5, 0, 1] activation 0.047508443176458935
FP 19 has linear regression coefficient 0.

radius: 2 atom list: [4, 16, 2, 17, 3, 5, 6, 7, 13, 15] activation 0.04103444337850793
radius: 2 atom list: [8, 5, 12, 14, 2, 4, 6, 7, 13, 15] activation 0.04080793505571921
radius: 2 atom list: [16, 10, 5, 7, 8, 9, 13, 15, 11, 17] activation 0.040786457460417404
radius: 2 atom list: [17, 2, 18, 3, 5, 6, 7, 14, 16, 4] activation 0.040710128032307856
radius: 2 atom list: [4, 6, 7, 13, 5, 14, 3, 8, 12, 1] activation 0.04036995750173401
radius: 2 atom list: [16, 3, 5, 6, 13, 15, 2, 7, 17, 4] activation 0.04034765474524912
radius: 2 atom list: [8, 9, 10, 11, 7] activation 0.04010627661556169
FP 28 has linear regression coefficient 0.0117177058721
radius: 0 atom list: [7] activation 0.021051218351482034
radius: 0 atom list: [1] activation 0.02040506103953964
radius: 0 atom list: [2] activation 0.020276048958243757
radius: 1 atom list: [6, 8, 10, 9] activation 0.019790083489879304
radius: 1 atom list: [4, 5, 2, 3] activation 0.019690984998490535
radius: 1 atom list: [10, 20, 11, 2] activatio

radius: 4 atom list: [8, 4, 9, 13, 3, 5, 1, 6, 7, 10, 11, 12] activation 0.16543107520626618
radius: 4 atom list: [10, 4, 9, 11, 6, 2, 1, 0, 3, 5, 7, 8] activation 0.16442106740699855
radius: 4 atom list: [16, 17, 19, 20, 12, 18, 21, 11, 15, 10, 13, 14] activation 0.16431588098049046
radius: 4 atom list: [10, 13, 12, 6, 7, 8, 9, 11, 14, 15, 16, 17, 18] activation 0.16259644118384659
radius: 4 atom list: [5, 6, 1, 2, 3, 0, 7, 10, 14, 4, 8, 9, 11, 12, 13] activation 0.16161491351066035
radius: 4 atom list: [0, 6, 8, 12, 14, 16, 18, 20, 2, 1, 3, 4, 5, 7, 9, 11, 13, 15, 17, 19] activation 0.15923629593383132
FP 37 has linear regression coefficient -0.0815410189956
radius: 4 atom list: [1, 0, 3, 4, 2] activation 0.16048375344839882
radius: 4 atom list: [1, 0, 3, 4, 2] activation 0.1597530708166684
radius: 4 atom list: [3, 6, 10, 5, 0, 2, 4, 1] activation 0.15408853577083106
radius: 4 atom list: [12, 13, 14, 15, 16, 17, 18, 11] activation 0.1504633188001129
radius: 4 atom list: [7, 10, 6, 11

radius: 2 atom list: [1, 0, 3, 4, 2] activation 0.04508959823519077
radius: 2 atom list: [3, 4, 0, 2, 1] activation 0.0448239806105627
radius: 2 atom list: [0, 2, 1, 3, 4] activation 0.04461977785963227
radius: 2 atom list: [2, 1, 4, 3, 0] activation 0.04457701743175387
radius: 2 atom list: [1, 0, 3, 4, 2] activation 0.04445078868742134
radius: 2 atom list: [5, 3, 0, 2, 1, 4] activation 0.0443803862788628
radius: 2 atom list: [1, 5, 4, 0, 2, 3] activation 0.04435511289518074
radius: 2 atom list: [1, 5, 4, 0, 2, 3] activation 0.04428968497608033
radius: 2 atom list: [3, 1, 4, 0, 2] activation 0.04398365224406065
radius: 2 atom list: [5, 7, 9, 6, 8, 4] activation 0.043971410867491366
radius: 2 atom list: [1, 0, 2, 4, 3] activation 0.04393858192846713
FP 47 has linear regression coefficient 0.000821820606705
radius: 4 atom list: [0, 5, 1, 4, 6, 11, 2, 3, 7, 8, 9, 10] activation 0.017476365103158913
radius: 4 atom list: [0, 4, 5, 10, 2, 7, 8, 11, 13, 1, 6, 9, 12, 3] activation 0.0174756994

### Compare error for SMILES with and without duplicates

In [38]:
# Dataset selection: target name and the stem of the CSV data file.
task_params = dict(target_name='logP',
                   data_file='logP_wo_parameters')

# Training settings. num_epochs, batch_size and params['init_scale'] are read
# as notebook globals by parse_training_params.
num_epochs, batch_size = 1, 100
normalize, dropout = 1, 0  # presumably 1/0 act as on/off flags -- confirm against consumers
activation = relu

# Network hyperparameters; init_scale and l2_penalty are given as exp(-4) and exp(-2).
params = dict(fp_length=20,
              fp_depth=3,
              init_scale=np.exp(-4),
              l2_penalty=np.exp(-2),
              conv_width=10)

# conv_width repeated fp_depth times; passed on as 'num_hidden_features'.
conv_layer_sizes = params['fp_depth'] * [params['conv_width']]
conv_arch_params = dict(num_hidden_features=conv_layer_sizes,
                        fp_length=params['fp_length'],
                        normalize=normalize,
                        return_atom_activations=False)

# Radii 0 .. fp_depth inclusive.
all_radii = range(1 + params['fp_depth'])

# Plotting parameters
num_figs_per_fp = 11
figsize = (100, 100)
# RGB components scaled from 0-255 into 0-1. A nice light blue.
highlight_color = tuple(channel / 255.0 for channel in (30.0, 100.0, 255.0))

In [39]:
# Load the SMILES/target arrays for the comparison from '<data_file>.csv'.
# NOTE(review): get_averaged_and_unique_smiles is defined elsewhere in the
# notebook; judging by the names, the "dub_*" pair presumably holds the
# duplicated SMILES and the "uniq_*" pair the unique ones -- confirm.
(dub_smiles, dub_targets,
 uniq_smiles, uniq_targets) = get_averaged_and_unique_smiles(
    task_params['data_file'] + '.csv')

# Called without arguments, so it presumably picks up its configuration from
# the notebook globals set in the preceding cells -- verify against its definition.
trained_network_weights = train_neural_fingerprint()

Loading data...
Regression on 8838 training points.
--------------------------------------------------------------------------------
Mean predictor

Performance (RMSE) on logP:
Train: 1.8146042407352028
Test:  1.8141849992522239
Dub: 2.1832152392353295
Unique:  1.8014880653973313

Performance (R2) on logP:
Train: 0.0
Test:  -0.0002990500750039704
Dub: -0.009233034859763123
Unique:  -7.835254551791238e-05
--------------------------------------------------------------------------------
Task params {'fp_length': 20, 'l2_penalty': 0.1353352832366127, 'fp_depth': 3, 'conv_width': 10, 'init_scale': 0.01831563888873418}
Convnet fingerprints with neural net
Total number of weights in the network: 9891
max of weights 0.06962983567500523
Iteration 0 loss 1.0063718238912063 train RMSE 1.8203069398792293 Train R2 0 : -0.006295212875175871 Validation RMSE 0 : 1.803738578845338 Validation R2 0 : -0.019677344906125382 Dub RMSE 0 : 2.15276806272327
Unique RMSE 0 : 1.8101555237011158
Dub R2 0 : 0.01872