In [1]:
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ray
import tqdm
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from scipy import sparse

In [2]:
# Start the local Ray runtime used by the remote fingerprinting tasks below.
# ignore_reinit_error=True makes this cell safe to re-run on a live kernel:
# a plain ray.init() raises if Ray has already been initialised.
ray.init(ignore_reinit_error=True)

2021-05-26 15:28:36,206	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '172.31.34.223',
 'raylet_ip_address': '172.31.34.223',
 'redis_address': '172.31.34.223:6379',
 'object_store_address': '/tmp/ray/session_2021-05-26_15-28-35_313386_9827/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-26_15-28-35_313386_9827/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-05-26_15-28-35_313386_9827',
 'metrics_export_port': 59591,
 'node_id': '75a8b87c7ba995ea45fc77b70fa42e8f447a553f1da0eace33bdfcca'}

In [35]:
# Target whose ligand set is processed; also used as the prefix of the output file names.
RECEPTOR = "FEN1"
# Directory holding this target's actives.smi / inactives.smi input files.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
INPUT_DATA_DIR = f"/mnt/efs/lit-pcba/{RECEPTOR}"
# Directory the fingerprint matrix and score array are written to (relative to the notebook's working directory).
OUTPUT_DATA_DIR = "../processed_data"

In [5]:
def get_data():
    """Load the active and inactive ligands into one labelled DataFrame.

    Reads the space-separated files ``actives.smi`` and ``inactives.smi`` from
    ``INPUT_DATA_DIR``. Each file is parsed as two header-less columns named
    ``Smiles`` and ``Ligand``.

    Returns:
        A DataFrame with columns ``Smiles``, ``Ligand`` and ``Active``
        (1 for rows from the actives file, 0 for rows from the inactives
        file), actives first, with a fresh 0..n-1 index.
    """
    # (path, label) pairs, in the order the rows should appear in the result.
    labelled_sources = (
        (f"{INPUT_DATA_DIR}/actives.smi", 1),
        (f"{INPUT_DATA_DIR}/inactives.smi", 0),
    )

    labelled_frames = []
    for smi_path, activity_label in labelled_sources:
        frame = pd.read_csv(smi_path, sep=" ", names=["Smiles", "Ligand"])
        labelled_frames.append(frame.assign(Active=activity_label))

    return pd.concat(labelled_frames, ignore_index=True)

# Fingerprint molecules
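The whole set of fingerprints won't fit in memory (even as a sparse matrix), so they have to be saved in chunks. The code iterates over the SMILES strings, generating fingerprint matrices and score arrays, which are meant to be saved in chunks of 10,000,000 molecules. Note: the cells shown below fingerprint the whole ligand table in one pass and write a single matrix/score file pair per receptor.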

In [20]:
@ray.remote
def create_fingerprint(smiles, score, i):
    """Compute the on-bit indices of a Morgan fingerprint for one molecule.

    Runs as a Ray remote task. The fingerprint is an 8192-bit, radius-2
    Morgan bit vector built with feature invariants (``useFeatures=True``),
    bond types taken into account and chirality ignored.

    Args:
        smiles: SMILES string of the molecule.
        score: Label attached to the molecule; passed through as a float.
        i: Position of the molecule in the input; used only for logging.

    Returns:
        ``(onbits, score)`` where ``onbits`` is the list of set bit indices
        and ``score`` is ``float(score)``; or ``None`` when ``smiles`` is not
        a string or RDKit cannot parse it, so the caller can drop the entry.
    """
    if i % 10000 == 0:
        # Progress marker: one log line every 10,000 molecules.
        logging.basicConfig(level=logging.INFO)
        logging.info(i)

    # Missing values read from the CSV arrive as NaN (a float), which RDKit
    # rejects with an exception; treat them like an unparseable SMILES.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # MolFromSmiles returns None (it does not raise) for invalid SMILES.
        # Passing None on to the fingerprint call would raise and fail the
        # whole ray.get() batch, so report the entry and skip it instead.
        logging.warning("Skipping unparseable SMILES at index %s: %r", i, smiles)
        return None

    pars = { "radius": 2,
             "nBits": 8192,
             "invariants": [],
             "fromAtoms": [],
             "useChirality": False,
             "useBondTypes": True,
             "useFeatures": True,
    }
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **pars)
    onbits = list(fp.GetOnBits())

    return onbits, float(score)

In [23]:
def get_fingerprints(ligands_df, fp_size=8192):
    """Fingerprint every ligand in parallel and assemble a sparse bit matrix.

    One ``create_fingerprint`` Ray task is submitted per row of ``ligands_df``;
    entries for which the task returned ``None`` are dropped.

    Args:
        ligands_df: DataFrame with a ``Smiles`` column and an ``Active`` column.
        fp_size: Number of columns of the returned matrix. It must be at least
            the fingerprint length produced by ``create_fingerprint`` (8192 in
            this notebook); a smaller value makes the matrix constructor raise.

    Returns:
        ``(fingerprint_matrix, scores)``: a boolean ``scipy.sparse`` CSR matrix
        with one row per retained molecule and ``fp_size`` columns, and a tuple
        of the matching scores in the same order. With no retained molecules
        the result is an empty ``(0, fp_size)`` matrix and an empty tuple.
    """
    future_values = [
        create_fingerprint.remote(smiles=smiles, score=score, i=i)
        for i, (smiles, score) in enumerate(zip(ligands_df["Smiles"], ligands_df["Active"]))
    ]

    # Keep only molecules that produced a result; bits and score travel
    # together in one tuple, so rows and scores stay aligned after filtering.
    values = [v for v in ray.get(future_values) if v is not None]
    if not values:
        # zip(*[]) cannot be unpacked into two names; return an empty result.
        return sparse.csr_matrix((0, fp_size), dtype=bool), ()

    all_bits, scores = zip(*values)

    row_idx = []
    col_idx = []
    for row, bits in enumerate(all_bits):
        # every on-bit of this molecule lives in the same row ...
        row_idx.extend([row] * len(bits))
        # ... and the bit positions are the column indices.
        col_idx.extend(bits)

    # The row count comes from the number of molecules, not max(row_idx) + 1:
    # a molecule with no on-bits contributes no indices, so the old formula
    # dropped trailing empty rows (misaligning matrix and scores) and raised
    # when no bits were set at all. Explicit integer dtypes keep empty index
    # lists from being interpreted as float arrays.
    fingerprint_matrix = sparse.coo_matrix(
        (
            np.ones(len(row_idx), dtype=bool),
            (np.asarray(row_idx, dtype=np.int64), np.asarray(col_idx, dtype=np.int64)),
        ),
        shape=(len(all_bits), fp_size),
    )

    # CSR supports efficient row slicing and arithmetic, unlike COO.
    fingerprint_matrix = sparse.csr_matrix(fingerprint_matrix)

    return fingerprint_matrix, scores

In [17]:
# Load the labelled ligand table (actives followed by inactives) from INPUT_DATA_DIR.
ligands_df = get_data()

In [24]:
# Fingerprint every ligand via Ray tasks; yields a sparse boolean matrix and the matching labels.
fingerprint_matrix, scores = get_fingerprints(ligands_df=ligands_df)

[2m[36m(pid=9986)[0m INFO:root:0
[2m[36m(pid=9968)[0m INFO:root:10000
[2m[36m(pid=9966)[0m INFO:root:20000
[2m[36m(pid=9968)[0m INFO:root:30000
[2m[36m(pid=9978)[0m INFO:root:40000
[2m[36m(pid=9973)[0m INFO:root:50000
[2m[36m(pid=9974)[0m INFO:root:60000
[2m[36m(pid=9978)[0m INFO:root:70000
[2m[36m(pid=9972)[0m INFO:root:80000
[2m[36m(pid=9969)[0m INFO:root:90000
[2m[36m(pid=9981)[0m INFO:root:100000
[2m[36m(pid=9976)[0m INFO:root:110000
[2m[36m(pid=9972)[0m INFO:root:120000
[2m[36m(pid=9986)[0m INFO:root:130000
[2m[36m(pid=9966)[0m INFO:root:140000
[2m[36m(pid=9967)[0m INFO:root:150000
[2m[36m(pid=9970)[0m INFO:root:160000
[2m[36m(pid=9971)[0m INFO:root:170000
[2m[36m(pid=9977)[0m INFO:root:180000
[2m[36m(pid=9978)[0m INFO:root:190000
[2m[36m(pid=9973)[0m INFO:root:200000
[2m[36m(pid=9975)[0m INFO:root:210000
[2m[36m(pid=9971)[0m INFO:root:220000
[2m[36m(pid=9978)[0m INFO:root:230000
[2m[36m(pid=9986)[0m INFO:r

In [37]:
# Persist the results for this receptor: the sparse fingerprint matrix as .npz
# and the labels as a dense .npy array.
# Create the output directory first — both writers fail if it does not exist.
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)
sparse.save_npz(f"{OUTPUT_DATA_DIR}/{RECEPTOR}_fingerprints.npz", fingerprint_matrix)
np.save(f"{OUTPUT_DATA_DIR}/{RECEPTOR}_scores.npy", np.array(scores))