In [1]:
from argparse import ArgumentParser, Namespace
import os
import glob
import gzip
import numpy as np
import pandas as pd
import pickle, sys
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import PandasTools
#from rdkit.ML.Cluster import Butina
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.manifold import MDS
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
import sklearn.feature_extraction
from sklearn.feature_selection import VarianceThreshold
import threading
from mpl_toolkits.axes_grid1 import make_axes_locatable
import pubchempy as pcp
from boruta import boruta_py
from mordred import Calculator, descriptors
import umap

In [2]:
def getMorganAsDict(mol,size=2):
    """Return the non-zero elements of a Morgan fingerprint as a plain dict.

    Parameters
    ----------
    mol : molecule object accepted by ``AllChem.GetMorganFingerprint``.
    size : value forwarded as the second positional argument of
        ``AllChem.GetMorganFingerprint`` (defaults to 2).

    Returns
    -------
    dict
        A fresh dictionary built from ``GetNonzeroElements()`` of the
        fingerprint.
    """
    fingerprint = AllChem.GetMorganFingerprint(mol, size)
    return dict(fingerprint.GetNonzeroElements())
  
def get_fps(drugs, morgan=False, bit=False,size=2):
    """Compute one fingerprint per molecule in ``drugs``.

    Parameters
    ----------
    drugs : iterable of molecules (list, array, pandas Series, ...).
    morgan : if true, use ``getMorganAsDict`` (count dictionary).
    bit : if true (and ``morgan`` is false), use
        ``AllChem.GetMorganFingerprintAsBitVect`` with ``nBits=2048``.
    size : value forwarded to both Morgan variants.

    When neither flag is set, ``Chem.RDKFingerprint`` is used.

    Returns
    -------
    list
        Fingerprints of the molecules that converted successfully, in input
        order. Molecules whose conversion raised are skipped, so the result
        may be shorter than ``drugs``; the number skipped is printed.
    """
    fps = []
    errors = 0
    # Iterate the container itself instead of indexing with 0..len-1: a pandas
    # Series with a non-default index would otherwise raise KeyError for every
    # position and all molecules would be silently counted as errors.
    for mol in drugs:
        try:
            if morgan:
                fp = getMorganAsDict(mol, size)
            elif bit:
                fp = AllChem.GetMorganFingerprintAsBitVect(mol, size, nBits=2048)
            else:
                fp = Chem.RDKFingerprint(mol)
        except Exception:
            # Deliberate best-effort: count and skip the molecule, but let
            # KeyboardInterrupt / SystemExit propagate.
            errors += 1
        else:
            fps.append(fp)
    print('Errors in conversion:',errors)
    return fps
  
def fps_distances(fps):
    """Build a matrix of Tanimoto distances between all fingerprints.

    For every fingerprint, ``DataStructs.BulkTanimotoSimilarity`` is called
    against the full list with ``returnDistance`` enabled and the resulting
    row is collected.

    Parameters
    ----------
    fps : sequence of fingerprints accepted by ``BulkTanimotoSimilarity``.

    Returns
    -------
    numpy.ndarray
        One row per fingerprint that could be compared. A fingerprint whose
        comparison raises is reported on stdout and its row is omitted, so
        the result can have fewer rows than columns.
    """
    dist_mat = []
    for i, fp in enumerate(fps):
        try:
            row = DataStructs.BulkTanimotoSimilarity(fp, fps, returnDistance=True)
        except Exception:
            # Best-effort: report and skip, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare ``except`` would.
            print('Failed molecule nr:',i)
            continue
        dist_mat.append(row)
    return np.array(dist_mat)
  
def fps_similarities(fps):
    """Build a matrix of Tanimoto similarities between all fingerprints.

    For every fingerprint, ``DataStructs.BulkTanimotoSimilarity`` is called
    against the full list with ``returnDistance=False`` and the resulting row
    is collected.

    Parameters
    ----------
    fps : sequence of fingerprints accepted by ``BulkTanimotoSimilarity``.

    Returns
    -------
    numpy.ndarray
        One row per fingerprint that could be compared. A fingerprint whose
        comparison raises is reported on stdout and its row is omitted, so
        the result can have fewer rows than columns.
    """
    sim_mat = []
    for i, fp in enumerate(fps):
        try:
            row = DataStructs.BulkTanimotoSimilarity(fp, fps, returnDistance=False)
        except Exception:
            # Best-effort: report and skip, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare ``except`` would.
            print('Failed molecule nr:',i)
            continue
        sim_mat.append(row)
    return np.array(sim_mat)

def get_embedding(model, components, adist, y=None):
    """Project ``adist`` to ``components`` dimensions with the chosen method.

    Parameters
    ----------
    model : str
        One of ``'mds'``, ``'isomap'``, ``'tsne'``, ``'pca'``, ``'pls'`` or
        ``'umap'``.
    components : int
        Number of output dimensions.
    adist : array-like
        Input matrix. ``'mds'``, ``'tsne'`` and ``'umap'`` hand it to
        estimators configured for a precomputed dissimilarity matrix;
        ``'isomap'``, ``'pca'`` and ``'pls'`` pass it to the estimator as-is.
    y : array-like, optional
        Target values; only used (and required) by ``'pls'``.

    Returns
    -------
    The embedded coordinates produced by the selected estimator.

    Raises
    ------
    ValueError
        If ``model`` is not a supported name, or ``model == 'pls'`` and no
        ``y`` was supplied.
    """
    if model == 'mds':
        print('Selected transformation: MDS')
        mds = MDS(n_components=components, dissimilarity="precomputed", random_state=6)
        coords = mds.fit(adist).embedding_
    elif model == 'isomap':
        print('Selected transformation: IsoMap')
        # Isomap has no ``random_state`` argument; passing one raises TypeError.
        # NOTE(review): adist is given to Isomap as a feature matrix, not as a
        # precomputed distance matrix - confirm this is intended.
        iso = Isomap(n_components=components)
        coords = iso.fit_transform(adist)
    elif model == 'tsne':
        print('Selected transformation: T-SNE')
        # init='random' is required together with metric='precomputed' on
        # scikit-learn releases whose default init is 'pca'.
        # NOTE(review): ``n_iter`` was renamed ``max_iter`` in recent
        # scikit-learn releases - adjust if the installed version rejects it.
        tsne = TSNE(n_components=components, verbose=1, perplexity=40, n_iter=300,
                    random_state=158984, metric='precomputed', init='random')
        coords = tsne.fit_transform(adist)
    elif model == 'pca':
        print('Selected transformation: PCA')
        pca = PCA(n_components=components, random_state=6)
        coords = pca.fit_transform(adist)
    elif model == 'pls':
        print('Selected transformation: PLS')
        # PLS is supervised: it cannot be fitted without targets, and
        # PLSRegression has no ``random_state`` argument.
        if y is None:
            raise ValueError("model 'pls' requires target values; pass them as y")
        pls = PLSRegression(n_components=components)
        # transform(X) alone returns only the X scores.
        coords = pls.fit(adist, y).transform(adist)
    elif model == 'umap':
        print('Selected transformation: UMAP')
        print('Selected transformation: MDS')
        # First turn the dissimilarities into coordinates with MDS, then run
        # UMAP on those. ``shape`` is an attribute, not a method.
        mds = MDS(n_components=adist.shape[0], dissimilarity="precomputed", random_state=6)
        mds_coords = mds.fit(adist).embedding_
        umap_ = umap.UMAP(n_neighbors=5, min_dist=0.3, n_components=components)
        coords = umap_.fit_transform(mds_coords)
    else:
        raise ValueError(
            "Unknown model {!r}; expected one of 'mds', 'isomap', 'tsne', "
            "'pca', 'pls', 'umap'".format(model)
        )
    return coords

def plot_transformation_2D(coords, coloring=None, figure=None, scale=None, color_correction=None,marker='.',s=None):
    """Scatter-plot the first two columns of ``coords``.

    Parameters
    ----------
    coords : 2-D array; columns 0 and 1 are used as x and y.
    coloring : optional per-point values used to colour the markers with the
        'jet' colormap. Values are converted to float and rounded to one
        decimal. When omitted, all points are drawn in black.
    figure : optional ``(fig, ax)`` pair to draw into; a new 8x8 figure with
        an equal-aspect axis is created otherwise.
    scale : ``'log'`` applies ``log10`` to ``coloring``; ``'p'`` applies
        ``-log10(value * 1e-9)``; anything else leaves the values as they are.
    color_correction : optional ``(vmin, vmax)`` pair used to build the colour
        normalisation; without it the scatter call receives ``norm=None``.
    marker, s : forwarded to ``ax.scatter``.

    Returns
    -------
    (fig, ax, sc) where ``sc`` is the object returned by ``ax.scatter``.
    """
    if figure:
        fig, ax = figure
    else:
        fig = plt.figure(figsize = (8,8))
        ax = fig.add_subplot(111)
        ax.set_aspect('equal',adjustable='box')

    if coloring is not None:
        values = np.array(coloring).astype(float)
        if scale == 'log':
            values = np.log10(values)
        elif scale == 'p':
            values = -np.log10(values*1e-9)
        values = np.around(values, decimals = 1)
        # ``norm`` must exist even when no colour range was requested,
        # otherwise the scatter call below fails with an unbound name.
        norm = None
        if color_correction is not None:
            norm = colors.Normalize(vmin=color_correction[0],vmax=color_correction[1])
        sc = ax.scatter(coords[:, 0],
                        coords[:, 1],
                        marker = marker,
                        c=values,
                        cmap=plt.get_cmap('jet'),
                        norm=norm, s=s
                  )
    else:
        # Keep the artist so the return statement works in this branch too.
        sc = ax.scatter(coords[:, 0],
                        coords[:, 1],
                        marker = marker,
                        c='k',s=s
                     )
    return fig, ax, sc

def external_prep(incsv, insdf):
    """Join a CSV table with molecules read from a gzipped SD file.

    Parameters
    ----------
    incsv : path of a CSV file that contains a ``cid`` column.
    insdf : path of a gzip-compressed SD file whose records carry a
        ``PUBCHEM_COMPOUND_CID`` property.

    Returns
    -------
    pandas.DataFrame
        The CSV rows inner-joined on ``cid`` with a ``Molecule`` column.
        Records the supplier yields as ``None`` are ignored and only the
        first molecule per ``cid`` is kept.
    """
    ext_df = pd.read_csv(incsv)
    # Consume the supplier while the file is open and close the handle
    # afterwards; the original left the gzip stream open.
    with gzip.open(insdf) as sdf_handle:
        ext_mols = [mol for mol in Chem.ForwardSDMolSupplier(sdf_handle) if mol is not None]
    ext_mol_df = pd.DataFrame({'Molecule':ext_mols,
                               'cid':[int(mol.GetProp('PUBCHEM_COMPOUND_CID')) for mol in ext_mols]})
    ext_mol_df = ext_mol_df.drop_duplicates(subset='cid').reset_index(drop=True)
    return ext_df.merge(ext_mol_df, on='cid', how='inner')
  
def ClusterFps(fps,cutoff=0.2):
    """Cluster fingerprints with ``Butina.ClusterData`` on Tanimoto distances.

    The distances are collected as a flat list: for each fingerprint from the
    second one onwards, ``1 - similarity`` to every earlier fingerprint is
    appended in order.

    Parameters
    ----------
    fps : sequence of fingerprints accepted by ``BulkTanimotoSimilarity``.
    cutoff : distance threshold forwarded to ``Butina.ClusterData``.

    Returns
    -------
    Whatever ``Butina.ClusterData`` returns for these distances.
    """
    from rdkit import DataStructs
    from rdkit.ML.Cluster import Butina

    count = len(fps)
    flat_distances = []
    for idx in range(1, count):
        # Compare fingerprint idx with all fingerprints before it.
        similarities = DataStructs.BulkTanimotoSimilarity(fps[idx], fps[:idx])
        for value in similarities:
            flat_distances.append(1 - value)

    return Butina.ClusterData(flat_distances, count, cutoff, isDistData=True)
  


In [3]:
def moses_parser_train():
    """Build the command-line parser for model training.

    One sub-command is registered per name returned by
    ``MODELS.get_model_names()``. Each sub-parser is first passed through the
    model's own train-parser hook and then through ``add_train_args``.

    Returns
    -------
    argparse.ArgumentParser
    """
    # Only ``ArgumentParser`` is imported from argparse in this notebook; the
    # bare module name ``argparse`` is not bound.
    parser = ArgumentParser()
    subparsers = parser.add_subparsers(
        title='Models trainer script', description='available models'
    )
    for model in MODELS.get_model_names():
        model_parser = subparsers.add_parser(model)
        add_train_args(MODELS.get_model_train_parser(model)(model_parser))
    return parser
def moses_train(model, config, model_state = None):
    """Train the model named ``model`` and save its weights.

    Parameters
    ----------
    model : name used to look up the trainer and model class in ``MODELS``.
    config : object providing ``seed``, ``device``, ``config_save``,
        ``train_load``, ``val_load``, ``vocab_load``, ``vocab_save`` and
        ``model_save``.
    model_state : optional state dict loaded into the freshly built model
        before training.

    Side effects: optionally saves ``config`` and the vocabulary with
    ``torch.save`` and always writes the trained state dict to
    ``config.model_save``.
    """
    set_seed(config.seed)
    device = torch.device(config.device)

    if config.config_save is not None:
        torch.save(config, config.config_save)

    # For CUDNN to work properly
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    train_data = read_smiles_csv(config.train_load)
    val_data = read_smiles_csv(config.val_load) if config.val_load else None
    trainer = MODELS.get_model_trainer(model)(config)

    if config.vocab_load is None:
        vocab = trainer.get_vocabulary(train_data)
    else:
        assert os.path.exists(config.vocab_load), \
            'vocab_load path does not exist!'
        # NOTE(review): torch.load unpickles the file - only use trusted paths.
        vocab = torch.load(config.vocab_load)

    if config.vocab_save is not None:
        torch.save(vocab, config.vocab_save)

    # Use a separate local for the network so the ``model`` name argument is
    # not shadowed.
    net = MODELS.get_model_class(model)(vocab, config)
    if model_state:
        net.load_state_dict(model_state)

    net = net.to(device)
    # NOTE(review): the network is put in eval mode right before fitting -
    # presumably the trainer switches modes itself; confirm.
    net.eval()
    trainer.fit(net, train_data, val_data)

    net = net.to('cpu')
    torch.save(net.state_dict(), config.model_save)
def moses_parser_generate():
    """Build the command-line parser for sample generation.

    One sub-command is registered per name returned by
    ``MODELS.get_model_names()``; each sub-parser is passed to
    ``add_sample_args``.

    Returns
    -------
    argparse.ArgumentParser
    """
    # Only ``ArgumentParser`` is imported from argparse in this notebook; the
    # bare module name ``argparse`` is not bound.
    parser = ArgumentParser()
    subparsers = parser.add_subparsers(
        title='Models sampler script', description='available models')
    for model in MODELS.get_model_names():
        add_sample_args(subparsers.add_parser(model))
    return parser
def moses_generate(model, config):
    """Sample SMILES strings from a saved model and write them to CSV.

    Parameters
    ----------
    model : name used to look up the model class in ``MODELS``.
    config : object providing ``seed``, ``device``, ``config_load``,
        ``vocab_load``, ``model_load``, ``n_samples``, ``n_batch``,
        ``max_len`` and ``gen_save``.

    The samples are written to ``config.gen_save`` as a single ``SMILES``
    column without an index. Nothing is returned.
    """
    set_seed(config.seed)
    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    # NOTE(review): torch.load unpickles these files - only use trusted paths.
    model_config = torch.load(config.config_load)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)

    generator = MODELS.get_model_class(model)(model_vocab, model_config)
    generator.load_state_dict(model_state)
    generator = generator.to(device)
    generator.eval()

    samples = []
    remaining = config.n_samples
    with tqdm(total=config.n_samples, desc='Generating samples') as progress:
        # Keep requesting batches until the requested number is reached.
        while remaining > 0:
            batch = generator.sample(
                min(remaining, config.n_batch), config.max_len
            )
            samples.extend(batch)

            produced = len(batch)
            remaining -= produced
            progress.update(produced)

    frame = pd.DataFrame(samples, columns=['SMILES'])
    frame.to_csv(config.gen_save, index=False)
def moses_parser_eval():
    """Build the command-line parser for metric evaluation.

    Options: ``--test_path`` (required), ``--test_scaffolds_path``,
    ``--gen_path`` (required), ``--ks`` (one or more integers, default
    ``[1000, 10000]``), ``--n_jobs`` (default 1) and ``--device``
    (default ``'cpu'``).

    Returns
    -------
    argparse.ArgumentParser
    """
    # Only ``ArgumentParser`` is imported from argparse in this notebook; the
    # bare module name ``argparse`` is not bound.
    parser = ArgumentParser()
    parser.add_argument('--test_path',
                        type=str, required=True,
                        help='Path to test molecules csv')
    parser.add_argument('--test_scaffolds_path',
                        type=str, required=False,
                        help='Path to scaffold test molecules csv')
    parser.add_argument('--gen_path',
                        type=str, required=True,
                        help='Path to generated molecules csv')
    # type=int keeps user-supplied values consistent with the integer
    # defaults; without it they would be parsed as strings.
    parser.add_argument('--ks',
                        nargs='+', type=int, default=[1000, 10000],
                        help='Prefixes to calc uniqueness at')
    parser.add_argument('--n_jobs',
                        type=int, default=1,
                        help='Number of processes to run metrics')
    parser.add_argument('--device',
                        type=str, default='cpu',
                        help='GPU device id (`cpu` or `cuda:n`)')

    return parser
def moses_evaluate(config, print_metrics=True):
    """Compute metrics for generated molecules against a test set.

    Parameters
    ----------
    config : object providing ``test_path``, ``test_scaffolds_path``,
        ``gen_path``, ``ks``, ``n_jobs`` and ``device``.
    print_metrics : when true, each metric is printed as ``name,value``.

    Returns
    -------
    The mapping returned by ``get_all_metrics``. It is returned whether or
    not the metrics are printed, so they can be shown and captured in a
    single call (previously ``None`` was returned when printing).
    """
    test = read_smiles_csv(config.test_path)
    test_scaffolds = None
    if config.test_scaffolds_path is not None:
        test_scaffolds = read_smiles_csv(config.test_scaffolds_path)
    gen = read_smiles_csv(config.gen_path)
    # No precomputed test statistics are supplied here.
    metrics = get_all_metrics(test, gen, k=config.ks, n_jobs=config.n_jobs,
                              device=config.device,
                              test_scaffolds=test_scaffolds,
                              ptest=None, ptest_scaffolds=None)

    if print_metrics:
        for name, value in metrics.items():
            print('{},{}'.format(name, value))
    return metrics

In [4]:
def chemprop_grid_search(args: Namespace):
    """Run a hyperparameter search and save the best setting as JSON.

    ``fmin`` is run with ``tpe.suggest`` over ``SPACE`` for
    ``args.num_iters`` evaluations. Every evaluation cross-validates a copy
    of ``args`` updated with the sampled hyperparameters. The best trial
    (lowest score if ``args.minimize_score``, highest otherwise) is logged and
    its hyperparameters are written to ``args.config_save_path``.

    Parameters
    ----------
    args : namespace providing ``log_dir``, ``save_dir``, ``quiet``,
        ``num_iters``, ``minimize_score``, ``metric`` and
        ``config_save_path``; copies of it are handed to ``cross_validate``
        and ``build_model``.
    """
    # Create loggers
    logger = create_logger(name='hyperparameter_optimization', save_dir=args.log_dir, quiet=True)
    train_logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)

    # One record per evaluated hyperparameter setting
    trials = []

    def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
        """Cross-validate one hyperparameter setting and return its loss."""
        # The sampler may hand integer-valued settings over as floats
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # Work on a private copy of args with the sampled values applied
        hyper_args = deepcopy(args)
        if args.save_dir is not None:
            name_parts = [f'{key}_{value}' for key, value in hyperparams.items()]
            hyper_args.save_dir = os.path.join(hyper_args.save_dir, '_'.join(name_parts))
        for key, value in hyperparams.items():
            setattr(hyper_args, key, value)

        logger.info(hyperparams)

        mean_score, std_score = cross_validate(hyper_args, train_logger)

        # A throwaway model is built only to report its parameter count
        temp_model = build_model(hyper_args)
        num_params = param_count(temp_model)
        logger.info(f'num params: {num_params:,}')
        logger.info(f'{mean_score} +/- {std_score} {hyper_args.metric}')

        trials.append({
            'mean_score': mean_score,
            'std_score': std_score,
            'hyperparams': hyperparams,
            'num_params': num_params
        })

        # A nan score is only tolerated for classification, where it counts as 0
        if np.isnan(mean_score):
            if hyper_args.dataset_type != 'classification':
                raise ValueError('Can\'t handle nan score for non-classification dataset.')
            mean_score = 0

        direction = 1 if hyper_args.minimize_score else -1
        return direction * mean_score

    fmin(objective, SPACE, algo=tpe.suggest, max_evals=args.num_iters)

    # Pick the best trial among those with a usable score
    finite_trials = [trial for trial in trials if not np.isnan(trial['mean_score'])]
    best_result = min(
        finite_trials,
        key=lambda trial: (1 if args.minimize_score else -1) * trial['mean_score'],
    )
    logger.info('best')
    logger.info(best_result['hyperparams'])
    logger.info(f'num params: {best_result["num_params"]:,}')
    logger.info(f'{best_result["mean_score"]} +/- {best_result["std_score"]} {args.metric}')

    # Save best hyperparameter settings as JSON config file
    makedirs(args.config_save_path, isfile=True)

    with open(args.config_save_path, 'w') as config_file:
        json.dump(best_result['hyperparams'], config_file, indent=4, sort_keys=True)