In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
import ray
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from scipy import sparse
import logging

In [2]:
# Start the local Ray runtime used by the remote `parse` tasks below.
# ignore_reinit_error=True keeps this cell safe to re-run in a live kernel:
# a second plain ray.init() raises because Ray is already initialised.
ray.init(ignore_reinit_error=True)

2021-05-26 12:09:05,641	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.68.111',
 'raylet_ip_address': '192.168.68.111',
 'redis_address': '192.168.68.111:6379',
 'object_store_address': '/tmp/ray/session_2021-05-26_12-09-04_239871_73533/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-26_12-09-04_239871_73533/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-05-26_12-09-04_239871_73533',
 'metrics_export_port': 65508,
 'node_id': '5ab98c405b79d9851aa4093b932e46195d5f1e9e99873163d9a5358a'}

# Fingerprint molecules
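The full set of fingerprints won't fit in memory (even as a sparse matrix), so it has to be saved in chunks. The code below iterates over the SMILES strings, generating fingerprint matrices and score arrays, which are meant to be saved in chunks of 10,000,000 molecules. (Note: as written, `get_fingerprints` processes the whole file in one pass and writes a single matrix/score pair; the chunking loop is not implemented yet.)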

In [4]:
@ray.remote
def parse(line, i):
    """Turn one row of the screening CSV into fingerprint on-bits and a score.

    This is a Ray remote function: invoke it as ``parse.remote(line, i)`` and
    collect the result with ``ray.get``.

    Parameters
    ----------
    line : str
        One row of the form ``zinc_id,smiles,score``.
    i : int
        Row index; only used to log progress every 100 rows.

    Returns
    -------
    tuple[list[int], float] or None
        ``(onbits, score)`` where ``onbits`` are the indices of the set bits
        of the 8192-bit Morgan fingerprint (radius 2, feature invariants).
        ``None`` when the row is skipped: blank line, empty score,
        ``"no_score"``, or a SMILES string RDKit cannot parse.

    Raises
    ------
    ValueError
        If the row does not have exactly three comma-separated fields or the
        score is not a number.
    """
    if i % 100 == 0:
        # Logging is configured here, inside the task, so that progress
        # messages are emitted from whichever process executes the task.
        logging.basicConfig(level=logging.INFO)
        logging.info(i)

    # Tolerate stray whitespace / carriage returns around the row.
    line = line.strip()
    if line == "":
        return None

    zinc_id, smiles, score = line.split(",")
    score = score.strip()
    # Rows without a usable score carry no training signal; skip them.
    if score == "" or score == "no_score":
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for invalid SMILES; passing that on to
        # the fingerprint call would raise and abort the whole batch.
        logging.warning("Skipping %s: could not parse SMILES %r", zinc_id, smiles)
        return None

    pars = {
        "radius": 2,
        "nBits": 8192,
        "invariants": [],
        "fromAtoms": [],
        "useChirality": False,
        "useBondTypes": True,
        "useFeatures": True,
    }
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **pars)
    onbits = list(fp.GetOnBits())

    return onbits, float(score)

In [5]:
def get_fingerprints(fname, outFileName, fpSize=8192):
    """Fingerprint every scored molecule in a CSV and save matrix + scores.

    Each data row of ``fname`` is handed to the remote ``parse`` task. Rows
    for which it returns a result are stacked into a boolean sparse matrix
    with one row per molecule and one column per fingerprint bit.

    Parameters
    ----------
    fname : str
        Path to the CSV file; its first line is treated as a header.
    outFileName : str
        Output path stem. The matrix is written to ``outFileName + '.npz'``
        and the scores to ``outFileName + '.npy'``.
    fpSize : int, optional
        Number of columns of the fingerprint matrix. It must be at least the
        fingerprint length produced by ``parse`` (8192 bits in this notebook).

    Raises
    ------
    ValueError
        If no row yields a fingerprint, or if a bit index does not fit into
        ``fpSize`` columns.
    """
    with open(fname) as f:
        # Drop the header line; blank lines are rejected by `parse` itself.
        lines = f.read().split("\n")[1:]

    future_values = [parse.remote(line, i) for (i, line) in enumerate(lines)]

    # `parse` returns None for rows it skips.
    values = [v for v in ray.get(future_values) if v is not None]
    if not values:
        raise ValueError("no scored molecules found in %r" % (fname,))
    all_bits, scores = zip(*values)

    n_mols = len(all_bits)
    bits_per_mol = [len(bits) for bits in all_bits]

    # Row i is repeated once per on-bit of molecule i; the matching column
    # indices are the on-bit positions themselves.
    row_idx = np.repeat(np.arange(n_mols), bits_per_mol)
    col_idx = np.concatenate([np.asarray(bits, dtype=np.int64) for bits in all_bits])

    if col_idx.size and col_idx.max() >= fpSize:
        raise ValueError(
            "fingerprint bit index %d does not fit into fpSize=%d"
            % (col_idx.max(), fpSize)
        )

    # The row count comes from the number of molecules, not max(row_idx) + 1,
    # so the matrix stays aligned with `scores` even when trailing molecules
    # have no on-bits.
    fingerprint_matrix = sparse.coo_matrix(
        (np.ones(len(row_idx), dtype=bool), (row_idx, col_idx)),
        shape=(n_mols, fpSize),
    )

    # CSR supports efficient row slicing and arithmetic downstream.
    fingerprint_matrix = sparse.csr_matrix(fingerprint_matrix)

    sparse.save_npz(outFileName + '.npz', fingerprint_matrix)
    np.save(outFileName + '.npy', np.array(scores))

# Count number of valid molecules:

In [6]:
# Screening table CSV: the first line is a header and each data row is
# split by the code in this notebook into `zinc_id,smiles,score`.
fname = '../data/AmpC_screen_table.csv'

In [7]:
def count_valid_molecules(fname):
    """Count the rows of a screening CSV that carry a usable score.

    The first line of the file is treated as a header. A data row counts as
    valid when its third comma-separated field (the score) is non-empty and
    is not the literal ``no_score``. Rows marked ``no_score`` are skipped
    rather than ending the scan, which matches how ``parse`` treats them.

    Parameters
    ----------
    fname : str
        Path to the CSV file.

    Returns
    -------
    int
        Number of valid rows.
    """
    count = 0
    with open(fname) as fileobj:
        fileobj.readline()  # skip the header line
        for line in fileobj:
            # Strip only the newline: slicing off the last character would
            # eat a real character when the final line is unterminated.
            words = line.rstrip("\n").split(',')
            if len(words) < 3:
                # Blank or truncated row: there is no score field to inspect.
                continue
            score = words[2]
            if len(score) < 1 or score == 'no_score':
                continue
            count += 1
    return count

In [None]:
# count_valid_molecules(fname)

In [None]:
# Fingerprint the whole table in a single call. A chunked loop of the form
# `for i in range(np.ceil(count / chunksize).astype(int))` was sketched here
# but is disabled: neither `count` nor `chunksize` is defined in the cells
# shown in this notebook.
get_fingerprints(fname, '../processed_data/AmpC_all')

[2m[36m(pid=73563)[0m INFO:root:0
[2m[36m(pid=73564)[0m INFO:root:200
[2m[36m(pid=73553)[0m INFO:root:100
[2m[36m(pid=73561)[0m INFO:root:300
[2m[36m(pid=73563)[0m INFO:root:400
[2m[36m(pid=73563)[0m INFO:root:500
[2m[36m(pid=73555)[0m INFO:root:600
[2m[36m(pid=73553)[0m INFO:root:800
[2m[36m(pid=73553)[0m INFO:root:900
[2m[36m(pid=73555)[0m INFO:root:700
[2m[36m(pid=73566)[0m INFO:root:1000
[2m[36m(pid=73552)[0m INFO:root:1100
[2m[36m(pid=73552)[0m INFO:root:1200
[2m[36m(pid=73557)[0m INFO:root:1300
[2m[36m(pid=73558)[0m INFO:root:1400
[2m[36m(pid=73554)[0m INFO:root:1500
[2m[36m(pid=73553)[0m INFO:root:1600
[2m[36m(pid=73552)[0m INFO:root:1800
[2m[36m(pid=73558)[0m INFO:root:1900
[2m[36m(pid=73562)[0m INFO:root:1700
[2m[36m(pid=73555)[0m INFO:root:2000
[2m[36m(pid=73560)[0m INFO:root:2100
[2m[36m(pid=73566)[0m INFO:root:2400
[2m[36m(pid=73560)[0m INFO:root:2300
[2m[36m(pid=73559)[0m INFO:root:2200
[2m[36m(pi

KeyboardInterrupt: 

Exception ignored in: 'ray._raylet.get_py_stack'
Traceback (most recent call last):
  File "/Users/ricomeinl/miniconda3/envs/dockop/lib/python3.8/inspect.py", line 1520, in currentframe
    def currentframe():
KeyboardInterrupt: 
[2m[36m(pid=73558)[0m INFO:root:458500
[2m[36m(pid=73564)[0m INFO:root:458800
[2m[36m(pid=73555)[0m INFO:root:458600
[2m[36m(pid=73560)[0m INFO:root:458700
[2m[36m(pid=73564)[0m INFO:root:459000
[2m[36m(pid=73553)[0m INFO:root:459200
[2m[36m(pid=73557)[0m INFO:root:458900
[2m[36m(pid=73556)[0m INFO:root:459100
[2m[36m(pid=73558)[0m INFO:root:459400
[2m[36m(pid=73567)[0m INFO:root:459300
[2m[36m(pid=73558)[0m INFO:root:459700
[2m[36m(pid=73559)[0m INFO:root:459500
[2m[36m(pid=73559)[0m INFO:root:459600
[2m[36m(pid=73555)[0m INFO:root:459800
[2m[36m(pid=73566)[0m INFO:root:460000
[2m[36m(pid=73557)[0m INFO:root:459900
[2m[36m(pid=73561)[0m INFO:root:460100
[2m[36m(pid=73554)[0m INFO:root:460300
[2m[36m(pid=