In [2]:
# Path to Neural Fingerprint scripts

import sys

# Make the Neural Fingerprint helper modules importable.  The membership test
# keeps sys.path free of duplicate entries when this cell is run again.
if '../../scripts/baselines/neuralfingerprints/utils' not in sys.path:
    sys.path.append('../../scripts/baselines/neuralfingerprints/utils')



In [3]:
from sklearn.metrics import r2_score

In [4]:
import os, pickle
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import DrawingOptions
import matplotlib.pyplot as plt

from build_vanilla_net import build_morgan_deep_net, relu, build_standard_net
from build_convnet import build_conv_deep_net, build_convnet_fingerprint_fun
from util import normalize_array, build_batched_grad
from optimizers import adam
from util import rmse
from mol_graph import degrees
from data_util import remove_duplicates


In [6]:
# new function for loading our datasets
def load_data(dataset_path = '../../data/3_final_data/split_data', prefix_name='logP_pH_range_mean', VALUE_COLUMN = 'logP', SMILES_COLUMN='smiles', filter_broken=False):
    """Load the train / test / validation CSV splits of one dataset.

    For each split the file ``<dataset_path>/<prefix_name>_<split>.csv`` is
    read with pandas.

    Parameters:
        dataset_path: directory that contains the split CSV files.
        prefix_name: file-name prefix shared by the three split files.
        VALUE_COLUMN: name of the column holding the regression target.
        SMILES_COLUMN: name of the column holding the SMILES strings.
        filter_broken: when True, rows whose SMILES cannot be parsed by RDKit,
            or that contain an atom whose degree is not in 0..5, are dropped
            and written to ``../../data/raw/broken_smiles_<prefix_name>.txt``.
            The default (False) loads every row and touches no log file.

    Returns:
        dict mapping 'train', 'test' and 'validation' to a tuple
        ``(smiles_values, target_values)`` of the two column arrays.
    """
    import os
    import pandas as pd

    broken_smiles = []

    def check_molecules(smiles):
        """Return True when the molecule is usable; remember it otherwise."""
        # Imported lazily so RDKit is only needed when filtering is requested.
        from rdkit.Chem import MolFromSmiles
        mol = MolFromSmiles(smiles)
        # MolFromSmiles returns None for a SMILES string it cannot parse.
        is_valid = mol is not None and all(
            atom.GetDegree() in (0, 1, 2, 3, 4, 5) for atom in mol.GetAtoms())
        if not is_valid:
            broken_smiles.append(smiles)
        return is_valid

    datasets = {}
    for split in ['train', 'test', 'validation']:
        data = pd.read_csv(os.path.join(dataset_path, prefix_name+'_'+split+'.csv'))
        if filter_broken:
            data = data[data[SMILES_COLUMN].map(check_molecules)]
        datasets[split] = (data[SMILES_COLUMN].values, data[VALUE_COLUMN].values)

    # The log file is only (re)written when the filter actually ran, so a plain
    # load neither wipes an earlier log nor requires the raw-data directory.
    if filter_broken:
        with open('../../data/raw/broken_smiles_'+prefix_name+'.txt', 'w') as f:
            f.writelines(smiles+'\n' for smiles in broken_smiles)

    return datasets

In [31]:
def parse_training_params(params):
    """Split the hyper-parameters into training settings and regression-net settings.

    Besides ``params`` (read for 'init_scale', 'fp_length' and 'l2_penalty'),
    the notebook-level globals ``num_epochs``, ``batch_size``, ``normalize``
    and ``activation`` are read.

    Returns the pair ``(nn_train_params, vanilla_net_params)``.
    """
    epochs, examples_per_batch = num_epochs, batch_size
    weight_scale = params['init_scale']
    training_settings = {'num_epochs': epochs,
                         'batch_size': examples_per_batch,
                         'param_scale': weight_scale}

    # A single layer of size fp_length: linear regression.
    hidden_layout = [params['fp_length']]
    normalize_flag = normalize
    weight_decay = params['l2_penalty']
    nonlinearity = activation
    regression_net_settings = {'layer_sizes': hidden_layout,
                               'normalize': normalize_flag,
                               'L2_reg': weight_decay,
                               'activation_function': nonlinearity}

    return training_settings, regression_net_settings

def train_nn(pred_fun, loss_fun, num_weights, train_smiles, train_raw_targets, train_params,
             validation_smiles=None, validation_raw_targets=None):
    """loss_fun has inputs (weights, smiles, targets)"""
    print "Total number of weights in the network:", num_weights
    npr.seed(0)
    init_weights = npr.randn(num_weights) * train_params['param_scale']

    train_targets, undo_norm = normalize_array(train_raw_targets)
    training_curve = []
    def callback(weights, iter):
        if iter % 10 == 0:
            print "max of weights", np.max(np.abs(weights))
            train_preds = undo_norm(pred_fun(weights, train_smiles))
            cur_loss = loss_fun(weights, train_smiles, train_targets)
            training_curve.append(cur_loss)
            print "Iteration", iter, "loss", cur_loss, "train RMSE", \
                np.sqrt(np.mean((train_preds - train_raw_targets)**2)),
            print "Train R2", iter, ":", \
                    r2_score(train_raw_targets, train_preds),
            if validation_smiles is not None:
                validation_preds = undo_norm(pred_fun(weights, validation_smiles))
                print "Validation RMSE", iter, ":", \
                    np.sqrt(np.mean((validation_preds - validation_raw_targets) ** 2)),
                print "Validation R2", iter, ":", \
                    r2_score(validation_raw_targets, validation_preds),
            dub_preds = undo_norm(pred_fun(weights, dub_smiles))
            uniq_preds = undo_norm(pred_fun(weights, uniq_smiles))
            print "Dub RMSE", iter, ":", rmse(dub_preds, dub_targets)
            print "Unique RMSE", iter, ":", rmse(uniq_preds,  uniq_targets)
            print "Dub R2", iter, ":", r2_score(dub_targets, dub_preds)
            print "Unique R2", iter, ":", r2_score(uniq_targets, uniq_preds)

    grad_fun = grad(loss_fun)
    grad_fun_with_data = build_batched_grad(grad_fun, train_params['batch_size'],
                                            train_smiles, train_targets)

    num_iters = train_params['num_epochs'] * len(train_smiles) / train_params['batch_size']
    trained_weights = adam(grad_fun_with_data, init_weights, callback=callback,
                           num_iters=num_iters)
    
    def predict_func(new_smiles):
        """Returns to the original units that the raw targets were in."""
        return undo_norm(pred_fun(trained_weights, new_smiles))
    return predict_func, trained_weights, training_curve


def train_neural_fingerprint():
    print "Loading data..."
    data = load_data(prefix_name = task_params['data_file'], VALUE_COLUMN = task_params['target_name'])

    train_inputs, train_targets = data['train']
    val_inputs,   val_targets   = data['validation']
    test_inputs,  test_targets  = data['test']

    print "Regression on", len(train_inputs), "training points."
    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        test_preds = pred_func(test_inputs)
        dub_preds = pred_func(dub_smiles)
        uniq_preds = pred_func(uniq_smiles)
        print "\nPerformance (RMSE) on " + task_params['target_name'] + ":"
        print "Train:", rmse(train_preds, train_targets)
        print "Test: ", rmse(test_preds,  test_targets)
        print "Dub:", rmse(dub_preds, dub_targets)
        print "Unique: ", rmse(uniq_preds,  uniq_targets)
        
        print "\nPerformance (R2) on " + task_params['target_name'] + ":"
        print "Train:", r2_score(train_targets, train_preds)
        print "Test: ", r2_score(test_targets, test_preds)
        print "Dub:", r2_score(dub_targets, dub_preds)
        print "Unique: ", r2_score(uniq_targets, uniq_preds)
        print "-" * 80
        return rmse(test_preds,  test_targets)

    print "-" * 80
    print "Mean predictor"
    y_train_mean = np.mean(train_targets)
    print_performance(lambda x : y_train_mean*np.ones(len(x)))

    print "Task params", params
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False

    print "Convnet fingerprints with neural net"
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
         train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 nn_train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
    print_performance(predict_func)
    return trained_weights


def draw_molecule_with_highlights(filename, smiles, highlight_atoms):
    """Draw the molecule given by `smiles` with `highlight_atoms` highlighted and save it to `filename`.

    The highlight colour and figure size come from the notebook-level globals
    `highlight_color` and `figsize`.
    """
    style = DrawingOptions()
    style.selectColor = highlight_color
    style.elemDict = {}   # Don't color nodes based on their element.
    style.bgColor = None

    figure = Draw.MolToMPL(Chem.MolFromSmiles(smiles),
                           highlightAtoms=highlight_atoms,
                           size=figsize,
                           options=style,
                           fitImage=False)

    figure.gca().set_axis_off()
    figure.savefig(filename, bbox_inches='tight')
    # Close the figure once it has been written to disk.
    plt.close(figure)


def construct_atom_neighbor_list(array_rep):
    """Concatenate the per-degree neighbour tables of `array_rep` into one list of lists.

    For every degree in the module-level `degrees`, the rows stored under the
    key ('atom_neighbors', degree) are appended in order, each converted to a
    plain list.
    """
    return [list(neighbour_row)
            for atom_degree in degrees
            for neighbour_row in array_rep[('atom_neighbors', atom_degree)]]


def plot(trained_weights, FIGURE_PATH = '../../data/raw'):
    
    print "Loading data..."
    data = load_data(prefix_name = task_params['data_file'], VALUE_COLUMN = task_params['target_name'])

    train_smiles, train_targets = data['train']
    val_inputs,   val_targets   = data['validation']
    test_inputs,  test_targets  = data['test']

    print "Convnet fingerprints with neural net"
    conv_arch_params['return_atom_activations'] = True
    output_layer_fun, parser, compute_atom_activations = \
       build_convnet_fingerprint_fun(**conv_arch_params)
    atom_activations, array_rep = compute_atom_activations(trained_weights, train_smiles)

    if not os.path.exists(os.path.join(FIGURE_PATH, 'figures')): os.makedirs(os.path.join(FIGURE_PATH, 'figures'))

    parent_molecule_dict = {}
    for mol_ix, atom_ixs in enumerate(array_rep['atom_list']):
        for atom_ix in atom_ixs:
            parent_molecule_dict[atom_ix] = mol_ix

    atom_neighbor_list = construct_atom_neighbor_list(array_rep)

    def get_neighborhood_ixs(array_rep, cur_atom_ix, radius):
        # Recursive function to get indices of all atoms in a certain radius.
        if radius == 0:
            return set([cur_atom_ix])
        else:
            cur_set = set([cur_atom_ix])
            for n_ix in atom_neighbor_list[cur_atom_ix]:
                cur_set.update(get_neighborhood_ixs(array_rep, n_ix, radius-1))
            return cur_set

    # Recreate trained network.
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False
    _, _, combined_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])

    net_loss_fun, net_pred_fun, net_parser = build_standard_net(**vanilla_net_params)
    net_weights = combined_parser.get(trained_weights, 'net weights')
    last_layer_weights = net_parser.get(net_weights, ('weights', 0))

    for fp_ix in range(params['fp_length']):
        print "FP {0} has linear regression coefficient {1}".format(fp_ix, last_layer_weights[fp_ix][0])
        combined_list = []
        for radius in all_radii:
            fp_activations = atom_activations[radius][:, fp_ix]
            combined_list += [(fp_activation, atom_ix, radius) for atom_ix, fp_activation in enumerate(fp_activations)]

        unique_list = remove_duplicates(combined_list, key_lambda=lambda x: x[0])
        combined_list = sorted(unique_list, key=lambda x: -x[0])

        for fig_ix in range(num_figs_per_fp):
            # Find the most-activating atoms for this fingerprint index, across all molecules and depths.
            activation, most_active_atom_ix, cur_radius = combined_list[fig_ix]
            most_activating_mol_ix = parent_molecule_dict[most_active_atom_ix]
            highlight_list_our_ixs = get_neighborhood_ixs(array_rep, most_active_atom_ix, cur_radius)
            highlight_list_rdkit = [array_rep['rdkit_ix'][our_ix] for our_ix in highlight_list_our_ixs]

            print "radius:", cur_radius, "atom list:", highlight_list_rdkit, "activation", activation
            draw_molecule_with_highlights(\
                os.path.join(FIGURE_PATH, 'figures',"fp_{0}_highlight_{1}.pdf".format(fp_ix, fig_ix)),
                train_smiles[most_activating_mol_ix],
                highlight_atoms=highlight_list_rdkit)

In [12]:
def get_averaged_and_unique_smiles(dataset_name, \
                                   datasets_path = "../../data/3_final_data", \
                                   not_averaged_dataset_name = 'logP.csv', \
                                   SMILES_COLUMN = 'smiles',\
                                   VALUE_COLUMN = 'logP'):
    """get smiles and logP with unique and duplicated measurements

    The file `not_averaged_dataset_name` is used to count, per SMILES, how many
    non-missing values of VALUE_COLUMN exist.  The rows of `dataset_name` are
    then split into those whose SMILES has more than one such value ("dub")
    and those with exactly one ("uniq").

    Parameters:
        dataset_name: CSV file (inside datasets_path) whose rows are split.
        datasets_path: directory containing both CSV files.
        not_averaged_dataset_name: CSV file used for counting measurements.
        SMILES_COLUMN: name of the SMILES column in both files.
        VALUE_COLUMN: name of the value column in both files.

    Returns:
        (dub_smiles, dub_targets, uniq_smiles, uniq_targets) as plain lists.
    """
    # pandas is not imported at notebook level; importing it here keeps the
    # function usable on a fresh kernel.
    import pandas as pd

    logP_dataset = pd.read_csv(os.path.join(datasets_path, not_averaged_dataset_name))
    dataset = pd.read_csv(os.path.join(datasets_path, dataset_name))

    # Number of non-missing values per SMILES in the non-averaged data.
    measurement_counts = logP_dataset.groupby([SMILES_COLUMN]).count()[VALUE_COLUMN]

    smiles_dub = list(measurement_counts[measurement_counts > 1].index)
    smiles_uniq = list(measurement_counts[measurement_counts == 1].index)

    dub_data = dataset[dataset[SMILES_COLUMN].isin(smiles_dub)]
    uniq_data = dataset[dataset[SMILES_COLUMN].isin(smiles_uniq)]

    dub_smiles, dub_targets = list(dub_data[SMILES_COLUMN]), list(dub_data[VALUE_COLUMN])
    uniq_smiles, uniq_targets = list(uniq_data[SMILES_COLUMN]), list(uniq_data[VALUE_COLUMN])

    return dub_smiles, dub_targets, uniq_smiles, uniq_targets

## logP_mean

### Visualization

In [27]:
# Dataset selection: target column and file-name prefix handed to load_data.
task_params = {
    'target_name': 'logP',
    'data_file': 'logp_mean',
}

# Training settings, read as globals by parse_training_params.
num_epochs, batch_size = 1, 100
normalize, dropout = 1, 0
activation = relu

# Hyper-parameters of the fingerprint network.
params = {
    'fp_length': 20,
    'fp_depth': 3,
    'init_scale': np.exp(-4),
    'learn_rate': np.exp(-4),
    'b1': np.exp(-4),
    'b2': np.exp(-4),
    'l2_penalty': np.exp(-4),
    'l1_penalty': np.exp(-5),
    'conv_width': 10,
}

# One convolution layer of width conv_width per fingerprint depth step.
conv_layer_sizes = params['fp_depth'] * [params['conv_width']]
conv_arch_params = {
    'num_hidden_features': conv_layer_sizes,
    'fp_length': params['fp_length'],
    'normalize': normalize,
    'return_atom_activations': False,
}

# Radii 0 .. fp_depth inclusive.
all_radii = range(0, params['fp_depth'] + 1)

# Plotting parameters
num_figs_per_fp = 11
figsize = (100, 100)
# A nice light blue.
highlight_color = tuple(channel / 255.0 for channel in (30.0, 100.0, 255.0))

In [32]:
trained_network_weights = train_neural_fingerprint()
# Pickle data is a byte stream: write and read the file in binary mode so the
# round trip does not depend on the platform's text-mode newline handling.
with open('results.pkl', 'wb') as f:
    pickle.dump(trained_network_weights, f)

# Plotting.
with open('results.pkl', 'rb') as f:
    trained_weights = pickle.load(f)
plot(trained_weights)

Loading data...
Regression on 9631 training points.
--------------------------------------------------------------------------------
Mean predictor

Performance (RMSE) on logP:
Train: 1.8591219922763682
Test:  1.9087278157747403
Dub: 2.178198827192713
Unique:  1.8030779775623207

Performance (R2) on logP:
Train: 0.0
Test:  -1.8278865013598988e-05
Dub: -0.004600498132678821
Unique:  -0.0018443796611973262
--------------------------------------------------------------------------------
Task params {'learn_rate': 0.01831563888873418, 'fp_depth': 3, 'b1': 0.01831563888873418, 'b2': 0.01831563888873418, 'init_scale': 0.01831563888873418, 'fp_length': 20, 'l2_penalty': 0.01831563888873418, 'l1_penalty': 0.006737946999085467, 'conv_width': 10}
Convnet fingerprints with neural net
Total number of weights in the network: 9891
max of weights 0.06962983567500523
Iteration 0 loss 1.0015408196365463 train RMSE 1.8605440963555582 Train R2 0 : -0.001530451462992577 Validation RMSE 0 : 1.8694340847349

radius: 3 atom list: [7, 1, 3, 6, 0, 2, 4, 5, 8] activation 0.09927388497075428
radius: 3 atom list: [4, 7, 10, 11, 14, 9, 15, 8, 12, 13, 16] activation 0.09926514529688635
radius: 3 atom list: [1, 2, 4, 6, 8, 10, 3, 5, 7, 9, 11] activation 0.09923963651257624
radius: 3 atom list: [0, 2, 32, 33, 1, 3, 5, 4, 31] activation 0.09913396014719324
FP 4 has linear regression coefficient -0.0370703061898
radius: 3 atom list: [0, 3, 5, 4, 6, 8, 7, 2, 1, 10, 9] activation 0.09925203115321078
radius: 3 atom list: [5, 7, 9, 1, 4, 6, 8, 2, 10, 0, 3] activation 0.09895188280117369
radius: 3 atom list: [34, 30, 33, 35, 41, 43, 45, 42, 44, 46, 47, 32] activation 0.09753845017580777
radius: 3 atom list: [2, 0, 3, 5, 6, 1, 4] activation 0.09695303982831739
radius: 3 atom list: [7, 10, 13, 14, 9, 12, 11, 16, 15] activation 0.09687574117479231
radius: 3 atom list: [1, 2, 4, 6, 3, 5, 16, 17, 18] activation 0.09662608032070982
radius: 3 atom list: [4, 7, 10, 11, 14, 9, 15, 8, 12, 13, 16] activation 0.096339

radius: 3 atom list: [4, 7, 10, 11, 14, 9, 15, 8, 12, 13, 16] activation 0.09708742891611805
radius: 3 atom list: [1, 2, 4, 6, 8, 10, 0, 3, 5, 7, 9] activation 0.09707213774448331
radius: 3 atom list: [7, 1, 3, 6, 0, 2, 4, 5, 8] activation 0.09703193435640178
radius: 3 atom list: [0, 2, 32, 33, 1, 3, 5, 4, 31] activation 0.09692153302924106
radius: 3 atom list: [1, 2, 4, 6, 8, 10, 3, 5, 7, 9, 11] activation 0.09691820045912501
FP 13 has linear regression coefficient -0.019693963584
radius: 3 atom list: [0, 3, 5, 4, 6, 8, 7, 2, 1, 10, 9] activation 0.09686448990023876
radius: 3 atom list: [5, 7, 9, 1, 4, 6, 8, 2, 10, 0, 3] activation 0.09661971678910115
radius: 3 atom list: [34, 30, 33, 35, 41, 43, 45, 42, 44, 46, 47, 32] activation 0.09546331675683599
radius: 3 atom list: [2, 0, 3, 5, 6, 1, 4] activation 0.09494148814455165
radius: 3 atom list: [7, 10, 13, 14, 9, 12, 11, 16, 15] activation 0.09489225966481231
radius: 3 atom list: [1, 2, 4, 6, 3, 5, 16, 17, 18] activation 0.094680271918

In [33]:
# Dataset selection: target column and file-name prefix handed to load_data.
task_params = {
    'target_name': 'logP',
    'data_file': 'logp_mean',
}

# Training settings, read as globals by parse_training_params.
num_epochs, batch_size = 5, 100
normalize, dropout = 1, 0
activation = relu

# Hyper-parameters of the (larger) fingerprint network.
params = {
    'fp_length': 50,
    'fp_depth': 4,
    'init_scale': np.exp(-4),
    'learn_rate': np.exp(-4),
    'b1': np.exp(-4),
    'b2': np.exp(-4),
    'l2_penalty': np.exp(-4),
    'l1_penalty': np.exp(-5),
    'conv_width': 20,
}

# One convolution layer of width conv_width per fingerprint depth step.
conv_layer_sizes = params['fp_depth'] * [params['conv_width']]
conv_arch_params = {
    'num_hidden_features': conv_layer_sizes,
    'fp_length': params['fp_length'],
    'normalize': normalize,
    'return_atom_activations': False,
}

# Radii 0 .. fp_depth inclusive.
all_radii = range(0, params['fp_depth'] + 1)

# Plotting parameters
num_figs_per_fp = 11
figsize = (100, 100)
# A nice light blue.
highlight_color = tuple(channel / 255.0 for channel in (30.0, 100.0, 255.0))

In [None]:
trained_network_weights = train_neural_fingerprint()
# Pickle data is a byte stream: write and read the file in binary mode so the
# round trip does not depend on the platform's text-mode newline handling.
with open('results.pkl', 'wb') as f:
    pickle.dump(trained_network_weights, f)

# Plotting.
with open('results.pkl', 'rb') as f:
    trained_weights = pickle.load(f)
plot(trained_weights)

Loading data...
Regression on 9631 training points.
--------------------------------------------------------------------------------
Mean predictor

Performance (RMSE) on logP:
Train: 1.8591219922763682
Test:  1.9087278157747403
Dub: 2.178198827192713
Unique:  1.8030779775623207

Performance (R2) on logP:
Train: 0.0
Test:  -1.8278865013598988e-05
Dub: -0.004600498132678821
Unique:  -0.0018443796611973262
--------------------------------------------------------------------------------
Task params {'learn_rate': 0.01831563888873418, 'fp_depth': 4, 'b1': 0.01831563888873418, 'b2': 0.01831563888873418, 'init_scale': 0.01831563888873418, 'fp_length': 50, 'l2_penalty': 0.01831563888873418, 'l1_penalty': 0.006737946999085467, 'conv_width': 20}
Convnet fingerprints with neural net
Total number of weights in the network: 30571
max of weights 0.08535001578936458
Iteration 0 loss 1.0049397981323287 train RMSE 1.86369714187141 Train R2 0 : -0.004927895158615581 Validation RMSE 0 : 1.85822046516658

### Compare error for SMILES with duplicates and without

In [21]:
# Dataset selection: target column and file-name prefix handed to load_data.
task_params = {
    'target_name': 'logP',
    'data_file': 'logp_mean',
}

# Training settings, read as globals by parse_training_params.
num_epochs, batch_size = 1, 100
normalize, dropout = 1, 0
activation = relu

# Hyper-parameters of the fingerprint network.
params = {
    'fp_length': 20,
    'fp_depth': 3,
    'init_scale': np.exp(-4),
    'l2_penalty': np.exp(-2),
    'conv_width': 10,
}

# One convolution layer of width conv_width per fingerprint depth step.
conv_layer_sizes = params['fp_depth'] * [params['conv_width']]
conv_arch_params = {
    'num_hidden_features': conv_layer_sizes,
    'fp_length': params['fp_length'],
    'normalize': normalize,
    'return_atom_activations': False,
}

# Radii 0 .. fp_depth inclusive.
all_radii = range(0, params['fp_depth'] + 1)

# Plotting parameters
num_figs_per_fp = 11
figsize = (100, 100)
# A nice light blue.
highlight_color = tuple(channel / 255.0 for channel in (30.0, 100.0, 255.0))

In [22]:
# Build the duplicate / unique evaluation sets first: the training report reads
# them as notebook-level globals.
(dub_smiles, dub_targets,
 uniq_smiles, uniq_targets) = get_averaged_and_unique_smiles(task_params['data_file']+'.csv')
trained_network_weights = train_neural_fingerprint()

Loading data...
Regression on 9631 training points.
--------------------------------------------------------------------------------
Mean predictor

Performance (RMSE) on logP:
Train: 1.8591219922763682
Test:  1.9087278157747403
Dub: 2.196473320235497
Unique:  1.8589662274903223

Performance (R2) on logP:
Train: 0.0
Test:  -1.8278865013598988e-05
Dub: -0.004552022728161242
Unique:  -1.558775978693916e-05
--------------------------------------------------------------------------------
Task params {'fp_length': 20, 'l2_penalty': 0.1353352832366127, 'fp_depth': 3, 'conv_width': 10, 'init_scale': 0.01831563888873418}
Convnet fingerprints with neural net
Total number of weights in the network: 9891
max of weights 0.06962983567500523
Iteration 0 loss 1.0016070624790232 train RMSE 1.8605440963555582 Train R2 0 : -0.001530451462992577 Validation RMSE 0 : 1.8694340847349085 Validation R2 0 : -0.015949090875271965 Dub RMSE 0 : 2.1786428990626363
Unique RMSE 0 : 1.8633197123142058
Dub R2 0 : 0.01

## logP_wo_parameters

### Visualization

In [None]:
# Dataset selection: target column and file-name prefix handed to load_data.
task_params = {
    'target_name': 'logP',
    'data_file': 'logP_wo_parameters',
}

# Training settings, read as globals by parse_training_params.
num_epochs, batch_size = 5, 100
normalize, dropout = 1, 0
activation = relu

# Hyper-parameters of the (larger) fingerprint network.
params = {
    'fp_length': 50,
    'fp_depth': 4,
    'init_scale': np.exp(-4),
    'l2_penalty': np.exp(-2),
    'conv_width': 20,
}

# One convolution layer of width conv_width per fingerprint depth step.
conv_layer_sizes = params['fp_depth'] * [params['conv_width']]
conv_arch_params = {
    'num_hidden_features': conv_layer_sizes,
    'fp_length': params['fp_length'],
    'normalize': normalize,
    'return_atom_activations': False,
}

# Radii 0 .. fp_depth inclusive.
all_radii = range(0, params['fp_depth'] + 1)

# Plotting parameters
num_figs_per_fp = 11
figsize = (100, 100)
# A nice light blue.
highlight_color = tuple(channel / 255.0 for channel in (30.0, 100.0, 255.0))

In [None]:
trained_network_weights = train_neural_fingerprint()
# Pickle data is a byte stream: write and read the file in binary mode so the
# round trip does not depend on the platform's text-mode newline handling.
with open('results.pkl', 'wb') as f:
    pickle.dump(trained_network_weights, f)

# Plotting.
with open('results.pkl', 'rb') as f:
    trained_weights = pickle.load(f)
plot(trained_weights)

In [29]:
trained_network_weights = train_neural_fingerprint()
# Pickle data is a byte stream: write and read the file in binary mode so the
# round trip does not depend on the platform's text-mode newline handling.
with open('results.pkl', 'wb') as f:
    pickle.dump(trained_network_weights, f)

# Plotting.
with open('results.pkl', 'rb') as f:
    trained_weights = pickle.load(f)
plot(trained_weights)

Loading data...
Regression on 8837 training points.
--------------------------------------------------------------------------------
Mean predictor

Performance (RMSE) on logP:
Train: 1.8146549349304573
Test:  1.786769730059924
--------------------------------------------------------------------------------
Task params {'fp_length': 50, 'l2_penalty': 0.1353352832366127, 'fp_depth': 4, 'conv_width': 20, 'init_scale': 0.01831563888873418}
Convnet fingerprints with neural net
Total number of weights in the network: 27441
max of weights 0.08144291040373952
Iteration 0 loss 0.9886515906588851 train RMSE 1.8042531705773819 Validation RMSE 0 : 1.782187447872895 max of weights 0.08015057691461697
Iteration 10 loss 1.00953598902406 train RMSE 1.8232229054122153 Validation RMSE 10 : 1.8078542461999854 max of weights 0.07794223909831255
Iteration 20 loss 0.9836639417604071 train RMSE 1.799710182979298 Validation RMSE 20 : 1.775675355283461 max of weights 0.08258433888104313
Iteration 30 loss 0.98

radius: 3 atom list: [9, 1, 4, 5, 8, 3, 10] activation 0.03688111516210626
radius: 3 atom list: [7, 8, 10, 12, 14, 9, 11, 13] activation 0.03688006013900692
radius: 3 atom list: [11, 13, 15, 7, 10, 12, 14, 9] activation 0.03687983483591856
FP 4 has linear regression coefficient -0.0674564992073
radius: 2 atom list: [15, 14, 17, 18, 16, 12, 13] activation 0.07263332693084341
radius: 2 atom list: [9, 8, 5] activation 0.07235106461077892
radius: 2 atom list: [2, 3, 1] activation 0.07233746009289359
radius: 2 atom list: [15, 16, 14, 18, 17, 13] activation 0.07227312238876252
radius: 2 atom list: [2, 0, 1] activation 0.07172148223888844
radius: 2 atom list: [10, 12, 11] activation 0.07161859330708381
radius: 2 atom list: [15, 16, 14, 18, 17, 13] activation 0.07125508433223503
radius: 2 atom list: [6, 5, 7, 8, 9] activation 0.06784753921174579
radius: 2 atom list: [3, 4, 1] activation 0.06761490266998506
radius: 2 atom list: [0, 3, 4, 1, 2] activation 0.06663839644539474
radius: 2 atom list:

radius: 2 atom list: [2, 7, 8, 9, 16, 6, 10, 15, 17, 22] activation 0.032855760202695324
radius: 2 atom list: [4, 5, 6, 12, 3, 7, 11, 13, 18, 19] activation 0.032843916431971286
radius: 2 atom list: [22, 2, 6, 4, 3, 24, 5, 10, 18, 23] activation 0.03260556641020427
radius: 2 atom list: [4, 2, 3, 5, 6, 7, 13, 15, 16, 17] activation 0.03253848986335458
radius: 2 atom list: [4, 5, 6, 13, 3, 7, 12, 14, 18, 19] activation 0.03249104972292756
radius: 2 atom list: [3, 4, 5, 6, 12, 13, 17, 18, 14, 7] activation 0.032456611251177575
radius: 2 atom list: [15, 1, 4, 5, 14, 3, 6, 24, 13] activation 0.03244203223324115
radius: 2 atom list: [2, 18, 3, 5, 6, 7, 14, 16, 17, 4] activation 0.032408596541668194
radius: 2 atom list: [15, 9, 3, 5, 6, 7, 12, 14, 16, 8] activation 0.03239373144916207
radius: 2 atom list: [16, 2, 4, 6, 7, 8, 5, 18, 20, 19] activation 0.03237881662672333
radius: 2 atom list: [16, 3, 5, 6, 13, 15, 2, 7, 17, 4] activation 0.03236442825825944
FP 14 has linear regression coefficie

radius: 3 atom list: [8, 9, 10, 11, 12, 13, 14] activation 0.06745212504911544
radius: 3 atom list: [1, 2, 3, 4, 5, 6, 0] activation 0.06736070466556014
radius: 3 atom list: [21, 15, 16, 17, 18, 19, 20] activation 0.06734872400582491
FP 22 has linear regression coefficient 0.0485123786553
radius: 2 atom list: [2, 7, 8, 9, 16, 6, 10, 15, 17, 22] activation 0.03891463956639332
radius: 2 atom list: [4, 5, 6, 12, 3, 7, 11, 13, 18, 19] activation 0.03888369810899686
radius: 2 atom list: [4, 2, 3, 5, 6, 7, 13, 15, 16, 17] activation 0.03848435799501331
radius: 2 atom list: [22, 2, 6, 4, 3, 24, 5, 10, 18, 23] activation 0.03846792247654398
radius: 2 atom list: [15, 1, 4, 5, 14, 3, 6, 24, 13] activation 0.038354018799339154
radius: 2 atom list: [2, 18, 3, 5, 6, 7, 14, 16, 17, 4] activation 0.03824897640751383
radius: 2 atom list: [15, 9, 3, 5, 6, 7, 12, 14, 16, 8] activation 0.03823540972349227
radius: 2 atom list: [16, 2, 4, 6, 7, 8, 5, 18, 20, 19] activation 0.03822178109879029
radius: 2 ato

radius: 2 atom list: [15, 16, 14, 18, 17, 13] activation 0.07077890973992583
radius: 2 atom list: [3, 4, 1] activation 0.06879605989088831
radius: 2 atom list: [6, 5, 7, 8, 9] activation 0.06860070598033363
FP 31 has linear regression coefficient -0.0274620576738
radius: 1 atom list: [1, 0] activation 0.02801805741838737
radius: 1 atom list: [3, 2] activation 0.0280004733610676
radius: 1 atom list: [3, 5] activation 0.027892955209480402
radius: 1 atom list: [0, 1] activation 0.027759402721880416
radius: 1 atom list: [18, 17] activation 0.027666407481306495
radius: 1 atom list: [18, 21] activation 0.02756696982815133
radius: 1 atom list: [14, 13] activation 0.027562056498218693
radius: 1 atom list: [7, 8] activation 0.02755966037600879
radius: 1 atom list: [15, 16] activation 0.027554806874065023
radius: 1 atom list: [15, 16] activation 0.027518627262030027
radius: 1 atom list: [13, 14] activation 0.027499315917395517
FP 32 has linear regression coefficient 0.12921246698
radius: 4 atom 

FP 40 has linear regression coefficient -0.0354881487666
radius: 1 atom list: [9, 8] activation 0.03427144674794161
radius: 1 atom list: [15, 16, 14, 18, 17, 13] activation 0.03358054834053237
radius: 1 atom list: [3, 4] activation 0.03299661337020233
radius: 1 atom list: [6, 9] activation 0.03297020209571208
radius: 1 atom list: [18, 17] activation 0.032934216589622076
radius: 1 atom list: [7, 8] activation 0.0329180104602864
radius: 1 atom list: [9, 6] activation 0.032898419487616654
radius: 1 atom list: [13, 14] activation 0.032894218591640434
radius: 1 atom list: [0, 1] activation 0.032818986985712305
radius: 1 atom list: [1, 0] activation 0.03272626430368111
radius: 1 atom list: [3, 2] activation 0.03267710603359108
FP 41 has linear regression coefficient -0.0320808855754
radius: 1 atom list: [1, 0] activation 0.02993885778813455
radius: 1 atom list: [3, 2] activation 0.029813399975502447
radius: 1 atom list: [3, 5] activation 0.029752788204433903
radius: 1 atom list: [0, 1] activ

### Compare error for SMILES with duplicates and without

In [None]:
# Dataset selection: target column and file-name prefix handed to load_data.
task_params = {
    'target_name': 'logP',
    'data_file': 'logP_wo_parameters',
}

# Training settings, read as globals by parse_training_params.
num_epochs, batch_size = 1, 100
normalize, dropout = 1, 0
activation = relu

# Hyper-parameters of the fingerprint network.
params = {
    'fp_length': 20,
    'fp_depth': 3,
    'init_scale': np.exp(-4),
    'l2_penalty': np.exp(-2),
    'conv_width': 10,
}

# One convolution layer of width conv_width per fingerprint depth step.
conv_layer_sizes = params['fp_depth'] * [params['conv_width']]
conv_arch_params = {
    'num_hidden_features': conv_layer_sizes,
    'fp_length': params['fp_length'],
    'normalize': normalize,
    'return_atom_activations': False,
}

# Radii 0 .. fp_depth inclusive.
all_radii = range(0, params['fp_depth'] + 1)

# Plotting parameters
num_figs_per_fp = 11
figsize = (100, 100)
# A nice light blue.
highlight_color = tuple(channel / 255.0 for channel in (30.0, 100.0, 255.0))

In [None]:
# Build the duplicate / unique evaluation sets first: the training report reads
# them as notebook-level globals.
(dub_smiles, dub_targets,
 uniq_smiles, uniq_targets) = get_averaged_and_unique_smiles(task_params['data_file']+'.csv')
trained_network_weights = train_neural_fingerprint()