In [1]:
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ray
import tqdm
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from scipy import sparse

In [2]:
# Start the local Ray runtime. `ignore_reinit_error=True` keeps this cell safe to
# re-run in a live kernel: a second bare `ray.init()` raises a RuntimeError because
# Ray is already initialised.
ray.init(ignore_reinit_error=True)

2021-05-26 19:40:09,185	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '172.31.34.223',
 'raylet_ip_address': '172.31.34.223',
 'redis_address': '172.31.34.223:6379',
 'object_store_address': '/tmp/ray/session_2021-05-26_19-40-08_300604_16307/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-26_19-40-08_300604_16307/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-05-26_19-40-08_300604_16307',
 'metrics_export_port': 61120,
 'node_id': 'e99384c38e3f2540aa3c7c0ecca53d3b6a104b0be5df1f2510a9a0d0'}

In [17]:
# Name of the data set; used to build both the input path and the output file names.
RECEPTOR = "EnamineHTS"
# CSV holding the ligand table. Derived from RECEPTOR so the two cannot drift apart
# (the original f-string had no placeholder and repeated the name by hand).
INPUT_DATA = f"/mnt/efs/enamine/{RECEPTOR}_scores.csv"
# Directory that receives the fingerprint matrix and the score array.
OUTPUT_DATA_DIR = "../processed_data"

In [8]:
def get_data(path=None):
    """Load the ligand table from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. When omitted, the notebook-level ``INPUT_DATA``
        constant is used, which matches the original zero-argument behaviour.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv``. Later cells read
        the ``smiles`` and ``score`` columns from this frame.
    """
    # Resolve the default at call time (not at definition time) so that editing
    # and re-running the configuration cell takes effect without redefining this.
    csv_path = INPUT_DATA if path is None else path
    return pd.read_csv(csv_path)

# Fingerprint molecules
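The whole set of fingerprints won't fit in memory (even as a sparse matrix), so we have to save them in chunks. This iterates over the SMILES strings, generating fingerprint matrices and score arrays, and saving them in chunks of 10,000,000 molecules. (Note: as written, the cells below build a single fingerprint matrix and a single score array and save them as one file each.)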

In [9]:
@ray.remote
def create_fingerprint(smiles, score, i):
    """Compute the on-bit indices of a Morgan fingerprint for one molecule.

    Runs as a Ray remote task.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    score : float-convertible
        Score associated with the molecule; returned as ``float(score)``.
    i : int
        Position of the molecule in the input; used for progress logging and
        in the warning emitted for unparseable entries.

    Returns
    -------
    tuple(list[int], float) or None
        ``(onbits, score)`` where ``onbits`` lists the set bits of the
        8192-bit fingerprint, or ``None`` when ``smiles`` is not a string or
        cannot be parsed by RDKit. ``get_fingerprints`` filters out falsy
        results, so ``None`` drops the molecule instead of failing the run.
    """
    if i % 10000 == 0:
        # Progress marker every 10,000 molecules; basicConfig is a no-op once
        # the worker's root logger already has a handler.
        logging.basicConfig(level=logging.INFO)
        logging.info(i)

    # Missing CSV values arrive as NaN (a float), and MolFromSmiles returns None
    # for SMILES it cannot parse; either case used to crash the task further down.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        logging.warning("Skipping entry %s: could not parse SMILES %r", i, smiles)
        return None

    pars = { "radius": 2,
             "nBits": 8192,
             "invariants": [],
             "fromAtoms": [],
             "useChirality": False,
             "useBondTypes": True,
             "useFeatures": True,
    }
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **pars)
    onbits = list(fp.GetOnBits())

    return onbits, float(score)

In [12]:
def get_fingerprints(ligands_df, fp_size=8192):
    """Fingerprint every ligand with Ray and pack the results into a sparse matrix.

    Parameters
    ----------
    ligands_df : pandas.DataFrame
        Must provide a ``smiles`` and a ``score`` column.
    fp_size : int, default 8192
        Number of columns of the returned matrix. It has to be at least the
        bit-vector length produced by ``create_fingerprint`` (8192); a smaller
        value makes scipy reject the out-of-range column indices.

    Returns
    -------
    fingerprint_matrix : scipy.sparse.csr_matrix
        Boolean matrix of shape ``(n_kept, fp_size)`` with one row per
        molecule that produced a result.
    scores : tuple of float
        Scores in the same order as the matrix rows.

    Notes
    -----
    Molecules whose task returned a falsy result are dropped, so row ``k`` of
    the matrix is not necessarily row ``k`` of ``ligands_df``.
    """
    # NOTE(review): this submits one Ray task per molecule; for millions of rows
    # batching several molecules per task would reduce scheduler load.
    future_values = [
        create_fingerprint.remote(smiles=smiles, score=score, i=i)
        for i, (smiles, score) in enumerate(zip(ligands_df["smiles"], ligands_df["score"]))
    ]

    results = ray.get(future_values)
    # Failed molecules are expected to come back as a falsy value (e.g. None).
    values = [v for v in results if v]

    n_dropped = len(results) - len(values)
    if n_dropped:
        logging.warning("%d of %d molecules produced no fingerprint and were dropped",
                        n_dropped, len(results))

    if not values:
        # Nothing to unpack: zip(*values) would raise, so return empty outputs.
        return sparse.csr_matrix((0, fp_size), dtype=bool), ()

    all_bits, scores = zip(*values)

    # Build the COO coordinates as numpy arrays rather than growing Python lists:
    # every on-bit of molecule r contributes the pair (r, bit).
    n_rows = len(all_bits)
    bits_per_row = np.fromiter((len(bits) for bits in all_bits),
                               dtype=np.int64, count=n_rows)
    row_idx = np.repeat(np.arange(n_rows), bits_per_row)
    col_idx = np.fromiter((bit for bits in all_bits for bit in bits),
                          dtype=np.int64, count=int(bits_per_row.sum()))

    # The row count comes from the number of molecules, not max(row_idx) + 1, so
    # trailing molecules without any on-bit still get a row matching their score.
    fingerprint_matrix = sparse.coo_matrix(
        (np.ones(row_idx.size, dtype=bool), (row_idx, col_idx)),
        shape=(n_rows, fp_size),
    )

    # CSR supports efficient row slicing and arithmetic for downstream use.
    return fingerprint_matrix.tocsr(), scores

In [13]:
ligands_df = get_data()

# Fail fast: the fingerprinting step reads exactly these two columns, and it is
# far cheaper to find out here than after the Ray tasks have been submitted.
missing_columns = {"smiles", "score"} - set(ligands_df.columns)
assert not missing_columns, f"input data is missing columns: {sorted(missing_columns)}"

In [15]:
fingerprint_matrix, scores = get_fingerprints(ligands_df=ligands_df)

# Each matrix row must pair with exactly one score; a mismatch means the saved
# fingerprints and scores would be misaligned.
assert fingerprint_matrix.shape[0] == len(scores), (
    f"{fingerprint_matrix.shape[0]} fingerprint rows but {len(scores)} scores"
)

[2m[36m(pid=16453)[0m INFO:root:0
[2m[36m(pid=16447)[0m INFO:root:10000
[2m[36m(pid=16446)[0m INFO:root:20000
[2m[36m(pid=16448)[0m INFO:root:30000
[2m[36m(pid=16454)[0m INFO:root:40000
[2m[36m(pid=16445)[0m INFO:root:50000
[2m[36m(pid=16451)[0m INFO:root:60000
[2m[36m(pid=16450)[0m INFO:root:70000
[2m[36m(pid=16452)[0m INFO:root:80000
[2m[36m(pid=16454)[0m INFO:root:90000
[2m[36m(pid=16449)[0m INFO:root:100000
[2m[36m(pid=16450)[0m INFO:root:110000
[2m[36m(pid=16452)[0m INFO:root:120000
[2m[36m(pid=16461)[0m INFO:root:130000
[2m[36m(pid=16444)[0m INFO:root:140000
[2m[36m(pid=16452)[0m INFO:root:150000
[2m[36m(pid=16444)[0m INFO:root:160000
[2m[36m(pid=16459)[0m INFO:root:170000
[2m[36m(pid=16452)[0m INFO:root:180000
[2m[36m(pid=16465)[0m INFO:root:190000
[2m[36m(pid=16449)[0m INFO:root:200000
[2m[36m(pid=16461)[0m INFO:root:210000
[2m[36m(pid=16444)[0m INFO:root:220000
[2m[36m(pid=16465)[0m INFO:root:230000
[2m[

[2m[36m(pid=16446)[0m INFO:root:1940000
[2m[36m(pid=16452)[0m INFO:root:1950000
[2m[36m(pid=16446)[0m INFO:root:1960000
[2m[36m(pid=16446)[0m INFO:root:1970000
[2m[36m(pid=16461)[0m INFO:root:1980000
[2m[36m(pid=16453)[0m INFO:root:1990000
[2m[36m(pid=16446)[0m INFO:root:2000000
[2m[36m(pid=16451)[0m INFO:root:2010000
[2m[36m(pid=16449)[0m INFO:root:2020000
[2m[36m(pid=16450)[0m INFO:root:2030000
[2m[36m(pid=16465)[0m INFO:root:2040000
[2m[36m(pid=16461)[0m INFO:root:2050000
[2m[36m(pid=16443)[0m INFO:root:2060000
[2m[36m(pid=16449)[0m INFO:root:2070000
[2m[36m(pid=16465)[0m INFO:root:2080000
[2m[36m(pid=16453)[0m INFO:root:2090000
[2m[36m(pid=16459)[0m INFO:root:2100000


In [18]:
# Create the output directory first so the long fingerprinting run above is not
# lost to a missing folder when the files are written.
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)
# Sparse fingerprint matrix (one row per molecule) in scipy's .npz format.
sparse.save_npz(f"{OUTPUT_DATA_DIR}/{RECEPTOR}_fingerprints.npz", fingerprint_matrix)
# Scores as a dense array, in the same order as the matrix rows.
np.save(f"{OUTPUT_DATA_DIR}/{RECEPTOR}_scores.npy", np.array(scores))

Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/dockop/lib/python3.8/site-packages/ray/_private/monitor.py", line 284, in run
    self._run()
  File "/home/ubuntu/anaconda3/envs/dockop/lib/python3.8/site-packages/ray/_private/monitor.py", line 175, in _run
    self.update_load_metrics()
  File "/home/ubuntu/anaconda3/envs/dockop/lib/python3.8/site-packages/ray/_private/monitor.py", line 139, in update_load_metrics
    response = self.gcs_node_resources_stub.GetAllResourceUsage(
  File "/home/ubuntu/anaconda3/envs/dockop/lib/python3.8/site-packages/grpc/_channel.py", line 946, in __call__
    return _end_unary_response_blocking(state, call, False, None)
  File "/home/ubuntu/anaconda3/envs/dockop/lib/python3.8/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
    raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.DEADLINE_EXCEEDED
	details = "Deadline Exceeded