In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
import ray
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from scipy import sparse
import logging

In [2]:
# Start the Ray runtime that schedules the @ray.remote tasks defined below.
# ignore_reinit_error=True keeps this cell re-runnable: without it, running the
# cell a second time in the same kernel raises because Ray is already initialised.
ray.init(ignore_reinit_error=True)

2021-05-26 12:57:24,579	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '172.31.36.235',
 'raylet_ip_address': '172.31.36.235',
 'redis_address': '172.31.36.235:6379',
 'object_store_address': '/tmp/ray/session_2021-05-26_12-57-23_694130_28713/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-26_12-57-23_694130_28713/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-05-26_12-57-23_694130_28713',
 'metrics_export_port': 53348,
 'node_id': '53244dc734d6b85951d051190d7e5566f353d2fe35cea4b3912cef20'}

In [3]:
# Root directory of the AmpC data: the raw table is read from DATA_DIR/data and
# the fingerprint outputs are written under DATA_DIR/processed_data.
DATA_DIR = "/mnt/efs/AmpC_data"

# Fingerprint molecules
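The full set of fingerprints won't fit in memory (even as a sparse matrix), so it has to be saved in chunks. The code below iterates over the SMILES strings, generating fingerprint matrices and score arrays, and is intended to save them in chunks of 10,000,000 molecules. (As currently written, `get_fingerprints` processes the whole table in a single pass and writes one output; the chunking loop is still commented out.)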

In [6]:
@ray.remote
def parse(line, i, fp_size=8192):
    """Turn one CSV row into Morgan-fingerprint on-bits and a numeric score.

    Parameters
    ----------
    line : str
        One data row of the form ``zinc_id,smiles,score``.
    i : int
        Row index; only used to log progress every 1000 rows.
    fp_size : int, optional
        Number of bits in the fingerprint bit vector (default 8192).

    Returns
    -------
    tuple[list[int], float] or None
        ``(on_bits, score)`` for a usable row. ``None`` when the row is blank,
        its score is empty or ``"no_score"``, or RDKit cannot parse the SMILES.

    Raises
    ------
    ValueError
        If the row does not have exactly three comma-separated fields, or the
        score is present but not a number.
    """
    if i % 1000 == 0:
        # basicConfig does nothing once the root logger already has a handler,
        # so calling it repeatedly in the same worker is harmless.
        logging.basicConfig(level=logging.INFO)
        logging.info(i)

    # Tolerate surrounding whitespace / stray line endings around the row.
    line = line.strip()
    if line == "":
        return None

    zinc_id, smiles, score = line.split(",")
    score = score.strip()
    # An empty score is skipped just like "no_score" (count_valid_molecules
    # ignores both) instead of crashing in float("").
    if score == "" or score == "no_score":
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for SMILES it cannot parse; passing None on
        # to the fingerprint call would raise and abort the whole ray.get().
        logging.warning("row %d (%s): could not parse SMILES %r, skipping", i, zinc_id, smiles)
        return None

    pars = {
        "radius": 2,
        "nBits": fp_size,
        "invariants": [],
        "fromAtoms": [],
        "useChirality": False,
        "useBondTypes": True,
        "useFeatures": True,
    }
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **pars)
    onbits = list(fp.GetOnBits())

    return onbits, float(score)

In [7]:
def get_fingerprints(fname, outFileName, fpSize=8192):
    """Fingerprint every usable row of a CSV table and save matrix + scores.

    The first line of ``fname`` is skipped as a header. Every other line is sent
    to the ``parse`` Ray task together with its index; rows for which ``parse``
    returns nothing are dropped. The remaining on-bit lists are assembled into
    a boolean sparse matrix with one row per kept molecule.

    Two files are written:
    ``outFileName + '.npz'`` holds the CSR fingerprint matrix and
    ``outFileName + '.npy'`` holds the scores in the same row order.

    Parameters
    ----------
    fname : str
        Path of the input CSV file.
    outFileName : str
        Output path without extension.
    fpSize : int, optional
        Number of columns of the saved matrix (default 8192). It must not be
        smaller than the bit length ``parse`` produces.
        NOTE(review): ``fpSize`` is not forwarded to ``parse``; it is called
        with its own default bit length.

    Raises
    ------
    ValueError
        If no row of the file yields a fingerprint.
    """
    from itertools import chain

    with open(fname) as f:
        next(f, None)  # skip the header line; tolerate an empty file
        # Stream the rows instead of holding the whole file as one string.
        future_values = [
            parse.remote(line.rstrip("\n"), i) for i, line in enumerate(f)
        ]

    values = [v for v in ray.get(future_values) if v is not None]
    if not values:
        raise ValueError(f"no usable molecules found in {fname!r}")
    all_bits, scores = zip(*values)

    n_mols = len(all_bits)
    # Molecule k contributes len(all_bits[k]) entries, all in row k.
    bit_counts = np.fromiter((len(bits) for bits in all_bits),
                             dtype=np.int64, count=n_mols)
    row_idx = np.repeat(np.arange(n_mols, dtype=np.int64), bit_counts)
    # ...and the column indices are the on-bits themselves, in the same order.
    col_idx = np.fromiter(chain.from_iterable(all_bits),
                          dtype=np.int64, count=int(bit_counts.sum()))

    # The row count comes from the number of molecules, not max(row_idx) + 1,
    # so rows stay aligned with `scores` even if a trailing molecule has no
    # on-bits. The width honours the fpSize argument.
    fingerprint_matrix = sparse.coo_matrix(
        (np.ones(len(row_idx), dtype=bool), (row_idx, col_idx)),
        shape=(n_mols, fpSize),
    )

    # CSR gives efficient row slicing for later use.
    fingerprint_matrix = fingerprint_matrix.tocsr()

    sparse.save_npz(outFileName + '.npz', fingerprint_matrix)
    np.save(outFileName + '.npy', np.array(scores))

# Count number of valid molecules:

In [8]:
# Input CSV for the cells below. Its first line is skipped as a header and the
# remaining rows are parsed as zinc_id,smiles,score.
fname = f"{DATA_DIR}/data/AmpC_screen_table.csv"

In [9]:
def count_valid_molecules(fname):
    """Count the rows of a CSV table that carry a usable score.

    The first line is treated as a header and skipped. A row is counted when it
    has at least three comma-separated fields and the third field is neither
    empty nor ``"no_score"``.

    Parameters
    ----------
    fname : str
        Path of the CSV file to scan.

    Returns
    -------
    int
        Number of counted rows (0 for an empty or header-only file).
    """
    count = 0
    with open(fname) as fileobj:
        next(fileobj, None)  # skip the header; tolerate an empty file
        for line in fileobj:
            # Strip only the line terminator: slicing off the last character
            # would eat part of the score when the final line has no newline.
            words = line.rstrip("\r\n").split(',')
            if len(words) < 3:
                # Blank or truncated row: nothing to count.
                continue
            score = words[2]
            if score == '' or score == 'no_score':
                # Skip unscored rows one by one (as parse does) rather than
                # stopping at the first one, so later scored rows still count.
                continue
            count += 1
    return count

In [10]:
# count_valid_molecules(fname=fname)

In [None]:
# Fingerprint the whole table in a single call; this writes AmpC_all.npz and
# AmpC_all.npy under DATA_DIR/processed_data.
# A chunked variant was sketched but left disabled (`count` and `chunksize` are
# not defined in the cells shown):
# for i in range( np.ceil(count / chunksize).astype(int) ):
get_fingerprints(fname, f"{DATA_DIR}/processed_data/AmpC_all")

[2m[36m(pid=28856)[0m INFO:root:0
[2m[36m(pid=28854)[0m INFO:root:1000
[2m[36m(pid=28853)[0m INFO:root:2000
[2m[36m(pid=28852)[0m INFO:root:3000
[2m[36m(pid=28849)[0m INFO:root:4000
[2m[36m(pid=28866)[0m INFO:root:5000
[2m[36m(pid=28850)[0m INFO:root:6000
[2m[36m(pid=28857)[0m INFO:root:7000
[2m[36m(pid=28855)[0m INFO:root:8000
[2m[36m(pid=28855)[0m INFO:root:9000
[2m[36m(pid=28862)[0m INFO:root:10000
[2m[36m(pid=28855)[0m INFO:root:11000
[2m[36m(pid=28852)[0m INFO:root:12000
[2m[36m(pid=28853)[0m INFO:root:13000
[2m[36m(pid=28850)[0m INFO:root:14000
[2m[36m(pid=28866)[0m INFO:root:15000
[2m[36m(pid=28852)[0m INFO:root:16000
[2m[36m(pid=28855)[0m INFO:root:17000
[2m[36m(pid=28852)[0m INFO:root:18000
[2m[36m(pid=28854)[0m INFO:root:19000
[2m[36m(pid=28863)[0m INFO:root:20000
[2m[36m(pid=28861)[0m INFO:root:21000
[2m[36m(pid=28856)[0m INFO:root:22000
[2m[36m(pid=28856)[0m INFO:root:23000
[2m[36m(pid=28860)[0m INFO

[2m[36m(pid=28852)[0m INFO:root:199000
[2m[36m(pid=28862)[0m INFO:root:200000
[2m[36m(pid=28857)[0m INFO:root:201000
[2m[36m(pid=28853)[0m INFO:root:202000
[2m[36m(pid=28852)[0m INFO:root:203000
[2m[36m(pid=28863)[0m INFO:root:204000
[2m[36m(pid=28861)[0m INFO:root:205000
[2m[36m(pid=28853)[0m INFO:root:206000
[2m[36m(pid=28862)[0m INFO:root:207000
[2m[36m(pid=28851)[0m INFO:root:208000
[2m[36m(pid=28850)[0m INFO:root:209000
[2m[36m(pid=28853)[0m INFO:root:210000
[2m[36m(pid=28849)[0m INFO:root:211000
[2m[36m(pid=28852)[0m INFO:root:212000
[2m[36m(pid=28855)[0m INFO:root:213000
[2m[36m(pid=28850)[0m INFO:root:214000
[2m[36m(pid=28855)[0m INFO:root:215000
[2m[36m(pid=28866)[0m INFO:root:216000
[2m[36m(pid=28866)[0m INFO:root:217000
[2m[36m(pid=28863)[0m INFO:root:218000
[2m[36m(pid=28863)[0m INFO:root:219000
[2m[36m(pid=28854)[0m INFO:root:220000
[2m[36m(pid=28863)[0m INFO:root:221000
[2m[36m(pid=28852)[0m INFO:root

[2m[36m(pid=28866)[0m INFO:root:395000
[2m[36m(pid=28853)[0m INFO:root:396000
[2m[36m(pid=28861)[0m INFO:root:397000
[2m[36m(pid=28855)[0m INFO:root:398000
[2m[36m(pid=28860)[0m INFO:root:399000
[2m[36m(pid=28863)[0m INFO:root:400000
[2m[36m(pid=28860)[0m INFO:root:401000
[2m[36m(pid=28852)[0m INFO:root:402000
[2m[36m(pid=28849)[0m INFO:root:403000
[2m[36m(pid=28851)[0m INFO:root:404000
[2m[36m(pid=28859)[0m INFO:root:405000
[2m[36m(pid=28860)[0m INFO:root:406000
[2m[36m(pid=28860)[0m INFO:root:407000
[2m[36m(pid=28857)[0m INFO:root:408000
[2m[36m(pid=28860)[0m INFO:root:409000
[2m[36m(pid=28855)[0m INFO:root:410000
[2m[36m(pid=28852)[0m INFO:root:411000
[2m[36m(pid=28855)[0m INFO:root:412000
[2m[36m(pid=28861)[0m INFO:root:413000
[2m[36m(pid=28861)[0m INFO:root:414000
[2m[36m(pid=28857)[0m INFO:root:415000
[2m[36m(pid=28851)[0m INFO:root:416000
[2m[36m(pid=28858)[0m INFO:root:417000
[2m[36m(pid=28855)[0m INFO:root

[2m[36m(pid=28866)[0m INFO:root:593000
[2m[36m(pid=28859)[0m INFO:root:594000
[2m[36m(pid=28857)[0m INFO:root:595000
[2m[36m(pid=28863)[0m INFO:root:596000
[2m[36m(pid=28860)[0m INFO:root:597000
[2m[36m(pid=28851)[0m INFO:root:598000
[2m[36m(pid=28861)[0m INFO:root:599000
[2m[36m(pid=28853)[0m INFO:root:600000
[2m[36m(pid=28862)[0m INFO:root:601000
[2m[36m(pid=28856)[0m INFO:root:602000
[2m[36m(pid=28858)[0m INFO:root:603000
[2m[36m(pid=28849)[0m INFO:root:604000
[2m[36m(pid=28855)[0m INFO:root:605000
[2m[36m(pid=28854)[0m INFO:root:606000
[2m[36m(pid=28866)[0m INFO:root:607000
[2m[36m(pid=28854)[0m INFO:root:608000
[2m[36m(pid=28858)[0m INFO:root:609000
[2m[36m(pid=28849)[0m INFO:root:610000
[2m[36m(pid=28855)[0m INFO:root:611000
[2m[36m(pid=28855)[0m INFO:root:612000
[2m[36m(pid=28855)[0m INFO:root:613000
[2m[36m(pid=28860)[0m INFO:root:614000
[2m[36m(pid=28862)[0m INFO:root:615000
[2m[36m(pid=28858)[0m INFO:root

[2m[36m(pid=28859)[0m INFO:root:790000
[2m[36m(pid=28857)[0m INFO:root:791000
[2m[36m(pid=28858)[0m INFO:root:792000
[2m[36m(pid=28850)[0m INFO:root:793000
[2m[36m(pid=28854)[0m INFO:root:794000
[2m[36m(pid=28854)[0m INFO:root:795000
[2m[36m(pid=28863)[0m INFO:root:796000
[2m[36m(pid=28860)[0m INFO:root:797000
[2m[36m(pid=28860)[0m INFO:root:798000
[2m[36m(pid=28850)[0m INFO:root:799000
[2m[36m(pid=28857)[0m INFO:root:800000
[2m[36m(pid=28857)[0m INFO:root:801000
[2m[36m(pid=28854)[0m INFO:root:802000
[2m[36m(pid=28859)[0m INFO:root:803000
[2m[36m(pid=28857)[0m INFO:root:804000
[2m[36m(pid=28853)[0m INFO:root:805000
[2m[36m(pid=28857)[0m INFO:root:806000
[2m[36m(pid=28852)[0m INFO:root:807000
[2m[36m(pid=28853)[0m INFO:root:808000
[2m[36m(pid=28849)[0m INFO:root:809000
[2m[36m(pid=28849)[0m INFO:root:810000
[2m[36m(pid=28853)[0m INFO:root:811000
[2m[36m(pid=28863)[0m INFO:root:812000
[2m[36m(pid=28866)[0m INFO:root

[2m[36m(pid=28862)[0m INFO:root:986000
[2m[36m(pid=28866)[0m INFO:root:987000
[2m[36m(pid=28866)[0m INFO:root:988000
[2m[36m(pid=28855)[0m INFO:root:989000
[2m[36m(pid=28852)[0m INFO:root:990000
[2m[36m(pid=28851)[0m INFO:root:991000
[2m[36m(pid=28852)[0m INFO:root:992000
[2m[36m(pid=28863)[0m INFO:root:993000
[2m[36m(pid=28855)[0m INFO:root:994000
[2m[36m(pid=28859)[0m INFO:root:995000
[2m[36m(pid=28857)[0m INFO:root:996000
[2m[36m(pid=28857)[0m INFO:root:997000
[2m[36m(pid=28858)[0m INFO:root:998000
[2m[36m(pid=28860)[0m INFO:root:999000
[2m[36m(pid=28860)[0m INFO:root:1000000
[2m[36m(pid=28859)[0m INFO:root:1001000
[2m[36m(pid=28855)[0m INFO:root:1002000
[2m[36m(pid=28850)[0m INFO:root:1003000
[2m[36m(pid=28856)[0m INFO:root:1004000
[2m[36m(pid=28853)[0m INFO:root:1005000
[2m[36m(pid=28852)[0m INFO:root:1006000
[2m[36m(pid=28858)[0m INFO:root:1007000
[2m[36m(pid=28849)[0m INFO:root:1008000
[2m[36m(pid=28862)[0m 

[2m[36m(pid=28852)[0m INFO:root:1178000
[2m[36m(pid=28859)[0m INFO:root:1179000
[2m[36m(pid=28852)[0m INFO:root:1180000
[2m[36m(pid=28860)[0m INFO:root:1181000
[2m[36m(pid=28853)[0m INFO:root:1182000
[2m[36m(pid=28866)[0m INFO:root:1183000
[2m[36m(pid=28860)[0m INFO:root:1184000
[2m[36m(pid=28853)[0m INFO:root:1185000
[2m[36m(pid=28852)[0m INFO:root:1186000
[2m[36m(pid=28849)[0m INFO:root:1187000
[2m[36m(pid=28854)[0m INFO:root:1188000
[2m[36m(pid=28850)[0m INFO:root:1189000
[2m[36m(pid=28849)[0m INFO:root:1190000
[2m[36m(pid=28858)[0m INFO:root:1191000
[2m[36m(pid=28859)[0m INFO:root:1192000
[2m[36m(pid=28857)[0m INFO:root:1193000
[2m[36m(pid=28862)[0m INFO:root:1194000
[2m[36m(pid=28860)[0m INFO:root:1195000
[2m[36m(pid=28853)[0m INFO:root:1196000
[2m[36m(pid=28856)[0m INFO:root:1197000
[2m[36m(pid=28862)[0m INFO:root:1198000
[2m[36m(pid=28850)[0m INFO:root:1199000
[2m[36m(pid=28860)[0m INFO:root:1200000
[2m[36m(p

[2m[36m(pid=28859)[0m INFO:root:1370000
[2m[36m(pid=28856)[0m INFO:root:1371000
[2m[36m(pid=28860)[0m INFO:root:1372000
[2m[36m(pid=28866)[0m INFO:root:1373000
[2m[36m(pid=28866)[0m INFO:root:1374000
[2m[36m(pid=28859)[0m INFO:root:1375000
[2m[36m(pid=28866)[0m INFO:root:1376000
[2m[36m(pid=28866)[0m INFO:root:1377000
[2m[36m(pid=28859)[0m INFO:root:1378000
[2m[36m(pid=28860)[0m INFO:root:1379000
[2m[36m(pid=28866)[0m INFO:root:1380000
[2m[36m(pid=28855)[0m INFO:root:1381000
[2m[36m(pid=28863)[0m INFO:root:1382000
[2m[36m(pid=28860)[0m INFO:root:1383000
[2m[36m(pid=28861)[0m INFO:root:1384000
[2m[36m(pid=28856)[0m INFO:root:1385000
[2m[36m(pid=28857)[0m INFO:root:1386000
[2m[36m(pid=28859)[0m INFO:root:1387000
[2m[36m(pid=28856)[0m INFO:root:1388000
[2m[36m(pid=28851)[0m INFO:root:1389000
[2m[36m(pid=28858)[0m INFO:root:1390000
[2m[36m(pid=28855)[0m INFO:root:1391000
[2m[36m(pid=28856)[0m INFO:root:1392000
[2m[36m(p

[2m[36m(pid=28863)[0m INFO:root:1563000
[2m[36m(pid=28858)[0m INFO:root:1564000
[2m[36m(pid=28857)[0m INFO:root:1565000
[2m[36m(pid=28862)[0m INFO:root:1566000
[2m[36m(pid=28866)[0m INFO:root:1567000
[2m[36m(pid=28853)[0m INFO:root:1568000
[2m[36m(pid=28860)[0m INFO:root:1569000
[2m[36m(pid=28858)[0m INFO:root:1570000
[2m[36m(pid=28849)[0m INFO:root:1571000
[2m[36m(pid=28849)[0m INFO:root:1572000
[2m[36m(pid=28851)[0m INFO:root:1573000
[2m[36m(pid=28853)[0m INFO:root:1574000
[2m[36m(pid=28849)[0m INFO:root:1575000
[2m[36m(pid=28850)[0m INFO:root:1576000
[2m[36m(pid=28854)[0m INFO:root:1577000
[2m[36m(pid=28857)[0m INFO:root:1578000
[2m[36m(pid=28862)[0m INFO:root:1579000
[2m[36m(pid=28862)[0m INFO:root:1580000
[2m[36m(pid=28859)[0m INFO:root:1581000
[2m[36m(pid=28851)[0m INFO:root:1582000
[2m[36m(pid=28866)[0m INFO:root:1583000
[2m[36m(pid=28849)[0m INFO:root:1584000
[2m[36m(pid=28854)[0m INFO:root:1585000
[2m[36m(p

[2m[36m(pid=28856)[0m INFO:root:1754000
[2m[36m(pid=28854)[0m INFO:root:1755000
[2m[36m(pid=28861)[0m INFO:root:1756000
[2m[36m(pid=28859)[0m INFO:root:1757000
[2m[36m(pid=28853)[0m INFO:root:1758000
[2m[36m(pid=28851)[0m INFO:root:1759000
[2m[36m(pid=28859)[0m INFO:root:1760000
[2m[36m(pid=28854)[0m INFO:root:1761000
[2m[36m(pid=28853)[0m INFO:root:1762000
[2m[36m(pid=28856)[0m INFO:root:1763000
[2m[36m(pid=28859)[0m INFO:root:1764000
[2m[36m(pid=28854)[0m INFO:root:1765000
[2m[36m(pid=28857)[0m INFO:root:1766000
[2m[36m(pid=28866)[0m INFO:root:1767000
[2m[36m(pid=28866)[0m INFO:root:1768000
[2m[36m(pid=28849)[0m INFO:root:1769000
[2m[36m(pid=28855)[0m INFO:root:1770000
[2m[36m(pid=28858)[0m INFO:root:1771000
[2m[36m(pid=28853)[0m INFO:root:1772000
[2m[36m(pid=28855)[0m INFO:root:1773000
[2m[36m(pid=28851)[0m INFO:root:1774000
[2m[36m(pid=28852)[0m INFO:root:1775000
[2m[36m(pid=28849)[0m INFO:root:1776000
[2m[36m(p

[2m[36m(pid=28855)[0m INFO:root:1946000
[2m[36m(pid=28853)[0m INFO:root:1947000
[2m[36m(pid=28855)[0m INFO:root:1948000
[2m[36m(pid=28866)[0m INFO:root:1949000
[2m[36m(pid=28862)[0m INFO:root:1950000
[2m[36m(pid=28863)[0m INFO:root:1951000
[2m[36m(pid=28855)[0m INFO:root:1952000
[2m[36m(pid=28853)[0m INFO:root:1953000
[2m[36m(pid=28860)[0m INFO:root:1954000
[2m[36m(pid=28855)[0m INFO:root:1955000
[2m[36m(pid=28856)[0m INFO:root:1956000
[2m[36m(pid=28858)[0m INFO:root:1957000
[2m[36m(pid=28861)[0m INFO:root:1958000
[2m[36m(pid=28860)[0m INFO:root:1959000
[2m[36m(pid=28866)[0m INFO:root:1960000
[2m[36m(pid=28849)[0m INFO:root:1961000
[2m[36m(pid=28849)[0m INFO:root:1962000
[2m[36m(pid=28851)[0m INFO:root:1963000
[2m[36m(pid=28861)[0m INFO:root:1964000
[2m[36m(pid=28858)[0m INFO:root:1965000
[2m[36m(pid=28854)[0m INFO:root:1966000
[2m[36m(pid=28863)[0m INFO:root:1967000
[2m[36m(pid=28866)[0m INFO:root:1968000
[2m[36m(p

In [16]:
!ls /mnt/efs/AmpC_data/data

AmpC_screen_table.csv  AmpC_screen_table.csv.zip
