In [34]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
import ray
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from scipy import sparse
import logging
from functools import lru_cache
from mol2vec.features import mol2alt_sentence, sentences2vec
from gensim.models.word2vec import Word2Vec

In [22]:
# Start the local Ray runtime used by the @ray.remote workers below.
# ignore_reinit_error=True lets this cell be re-executed in a live kernel:
# a second plain ray.init() raises RuntimeError once Ray is already running.
ray.init(ignore_reinit_error=True)

2021-05-27 10:01:22,539	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '172.31.35.146',
 'raylet_ip_address': '172.31.35.146',
 'redis_address': '172.31.35.146:6379',
 'object_store_address': '/tmp/ray/session_2021-05-27_10-01-21_621651_7371/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-27_10-01-21_621651_7371/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-05-27_10-01-21_621651_7371',
 'metrics_export_port': 57843,
 'node_id': 'f3b45bed8f794fbe9963d6c72fdbb842cb1568390c535ac8a372ed62'}

In [23]:
# Number of row-wise pieces the ligand table is split into; each piece is
# featurised separately and written to its own numbered output file.
NUM_CHUNKS = 1

In [41]:
# Selects the featurisation performed by the driver loop:
# True -> mol2vec embeddings, False -> Morgan fingerprints.
USE_EMBEDDINGS = False

In [42]:
# Dataset tag; prefixes the input CSV name and every output file name.
RECEPTOR = "EnamineHTS"
# Directory holding the input CSV; the driver loop writes its outputs here too.
DATA_DIR = "/mnt/efs/enamine"
# Input CSV; the featurisation code reads its "smiles" and "score" columns.
INPUT_DATA = f"{DATA_DIR}/{RECEPTOR}_scores.csv"

# Saved Word2Vec model file, loaded with Word2Vec.load.
MODEL_PATH = "/home/ubuntu/mol2vec/examples/models/model_300dim.pkl"
# Value passed to sentences2vec as its `unseen` argument.
UNCOMMON = "UNK"

In [26]:
@lru_cache(maxsize=2)
def _read_ligands_csv(path):
    """Read the CSV at ``path`` into a DataFrame, memoised per path."""
    return pd.read_csv(path)


def get_data(path=None):
    """Load the ligand table as a pandas DataFrame.

    The cache is keyed on the file path. Previously the zero-argument cached
    function kept returning the first table it had read, even after
    ``INPUT_DATA`` was changed and the configuration cell re-run.

    Args:
        path: CSV file to read. Defaults to the current value of the
            module-level ``INPUT_DATA`` (looked up at call time).

    Returns:
        The DataFrame produced by ``pd.read_csv``. The same object is
        returned on repeated calls for the same path, so callers should not
        mutate it in place.
    """
    return _read_ligands_csv(INPUT_DATA if path is None else path)


# Keep the lru_cache management hooks that the cached function used to expose.
get_data.cache_clear = _read_ligands_csv.cache_clear
get_data.cache_info = _read_ligands_csv.cache_info

In [27]:
@lru_cache(maxsize=2)
def _load_w2v_model(path):
    """Load the Word2Vec model stored at ``path``, memoised per path."""
    # NOTE(review): gensim's load is pickle-based as far as I know, so this
    # should only ever be pointed at model files from a trusted source.
    word2vec_model = Word2Vec.load(path)
    # NOTE(review): the recorded output of the cell that loads the model
    # echoes this line, which looks like a deprecation warning from the
    # installed gensim. Confirm against the gensim version in use before
    # removing the call.
    word2vec_model.wv.init_sims()
    return word2vec_model


def get_w2v_model(path=None):
    """Return the Word2Vec model used for mol2vec embeddings.

    The cache is keyed on the model path, so changing ``MODEL_PATH`` and
    calling again loads the new file instead of returning the old model.

    Args:
        path: Model file to load. Defaults to the current value of the
            module-level ``MODEL_PATH`` (looked up at call time).

    Returns:
        The loaded ``Word2Vec`` instance, shared between calls that use the
        same path.
    """
    return _load_w2v_model(MODEL_PATH if path is None else path)


# Keep the lru_cache management hooks that the cached function used to expose.
get_w2v_model.cache_clear = _load_w2v_model.cache_clear
get_w2v_model.cache_info = _load_w2v_model.cache_info

In [43]:
@ray.remote
def create_fingerprint(smiles, score, i, fp_size=8192):
    """Ray task: compute the on-bits of a Morgan fingerprint for one molecule.

    Args:
        smiles: SMILES string of the molecule.
        score: Score paired with the molecule; returned as a ``float``.
        i: Position of the molecule in the submitted batch. Only used to log
            progress every 10000 molecules.
        fp_size: Number of bits in the folded fingerprint. The default of
            8192 matches the matrix width hard-coded in ``get_fingerprints``.

    Returns:
        ``(onbits, score)``, where ``onbits`` is the list of set bit indices.
        Returns ``None`` when the SMILES cannot be parsed, so the caller can
        filter the molecule out instead of the whole batch failing.
    """
    if i % 10000 == 0:
        logging.basicConfig(level=logging.INFO)
        logging.info(i)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure with None; fingerprinting
        # None would raise inside the worker and abort ray.get() upstream.
        return None

    pars = {
        "radius": 2,
        "nBits": fp_size,
        "invariants": [],
        "fromAtoms": [],
        "useChirality": False,
        "useBondTypes": True,
        "useFeatures": True,
    }
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **pars)
    onbits = list(fp.GetOnBits())

    return onbits, float(score)

In [44]:
@ray.remote
def create_mol_sentence(smiles, score, r, i):
    """Ray task: turn one molecule into a mol2vec "sentence".

    Args:
        smiles: SMILES string of the molecule.
        score: Score paired with the molecule; returned as a ``float``, the
            same way ``create_fingerprint`` returns it.
        r: Radius forwarded to ``mol2alt_sentence``.
        i: Position of the molecule in the submitted batch. Only used to log
            progress every 10000 molecules.

    Returns:
        ``(alt_sentence, score)``, or ``None`` when the SMILES cannot be
        parsed.
    """
    if i % 10000 == 0:
        logging.basicConfig(level=logging.INFO)
        logging.info(i)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable SMILES: the caller drops None results.
        return None

    alt_sentence = mol2alt_sentence(mol, radius=r)

    return alt_sentence, float(score)

In [45]:
def get_fingerprints(ligands_df, fp_size=8192):
    """Build a sparse boolean fingerprint matrix for every ligand in a frame.

    One ``create_fingerprint`` Ray task is submitted per row. Rows whose task
    returned ``None`` are dropped, and the remaining on-bit lists are packed
    into a CSR matrix with one row per molecule.

    Args:
        ligands_df: DataFrame with ``"smiles"`` and ``"score"`` columns.
        fp_size: Currently unused. ``create_fingerprint`` is invoked without
            a size argument and the matrix width is fixed at 8192 columns.
            The parameter is kept so existing callers keep working.

    Returns:
        ``(fingerprint_matrix, scores)``: a ``scipy.sparse.csr_matrix`` of
        dtype bool with shape ``(n_molecules, 8192)``, and a tuple of scores
        in the same row order. Both are empty when no molecule produced a
        fingerprint.
    """
    future_values = [
        create_fingerprint.remote(smiles=smiles, score=score, i=i)
        for i, (smiles, score) in enumerate(zip(ligands_df["smiles"], ligands_df["score"]))
    ]
    values = [v for v in ray.get(future_values) if v is not None]

    unfolded_size = 8192
    if not values:
        # Empty chunk (or nothing parseable): zip(*values) would fail to
        # unpack, so return a correctly shaped empty result instead.
        return sparse.csr_matrix((0, unfolded_size), dtype=bool), ()

    all_bits, scores = zip(*values)

    # Row i repeats once per on-bit of molecule i; the columns are the bits.
    bits_per_row = [len(bits) for bits in all_bits]
    row_idx = np.repeat(np.arange(len(all_bits)), bits_per_row)
    col_idx = np.fromiter(
        (bit for bits in all_bits for bit in bits),
        dtype=np.int64,
        count=sum(bits_per_row),
    )

    # The row count is the number of molecules, not max(row_idx) + 1. The
    # latter drops trailing molecules that have no on-bits, which leaves the
    # matrix shorter than `scores`.
    fingerprint_matrix = sparse.coo_matrix(
        (np.ones(len(row_idx), dtype=bool), (row_idx, col_idx)),
        shape=(len(all_bits), unfolded_size),
    )

    # CSR supports the efficient row slicing that downstream use needs.
    fingerprint_matrix = sparse.csr_matrix(fingerprint_matrix)

    return fingerprint_matrix, scores

In [46]:
def get_embeddings(ligands_df, model, radius=1):
    """Convert every ligand in a frame into a mol2vec sentence.

    One ``create_mol_sentence`` Ray task is submitted per row, and rows whose
    task returned ``None`` are dropped. Turning sentences into vectors is
    left to the caller.

    Args:
        ligands_df: DataFrame with ``"smiles"`` and ``"score"`` columns.
        model: Not used by this function; kept so existing calls that pass
            the Word2Vec model keep working.
        radius: Radius forwarded to each ``create_mol_sentence`` task.

    Returns:
        ``(mol_sentences, scores)``: two tuples in matching order. Both are
        empty when no molecule produced a sentence.
    """
    future_values = [
        create_mol_sentence.remote(smiles=smiles, score=score, r=radius, i=i)
        for i, (smiles, score) in enumerate(zip(ligands_df["smiles"], ligands_df["score"]))
    ]
    values = [v for v in ray.get(future_values) if v is not None]

    if not values:
        # zip(*[]) yields nothing to unpack; return empty results instead of
        # raising on an empty or entirely unparseable chunk.
        return (), ()

    mol_sentences, scores = zip(*values)

    return mol_sentences, scores

In [47]:
# Load the ligand table once; get_data() memoises the result.
ligands_df = get_data()

In [37]:
# Load the Word2Vec model; the driver loop only uses it when USE_EMBEDDINGS
# is True.
word2vec_model = get_w2v_model()

  word2vec_model.wv.init_sims()


In [48]:
# Featurise the ligand table chunk by chunk and write one set of output files
# per chunk into DATA_DIR.
start = time.time()

# Split row positions and slice with .iloc rather than handing the DataFrame
# to np.array_split. The chunk sizes are the same, but this avoids the
# DataFrame.swapaxes path that np.array_split goes through, which recent
# pandas releases deprecate.
chunk_positions = np.array_split(np.arange(len(ligands_df)), NUM_CHUNKS)

for i, positions in enumerate(chunk_positions):
    # Time each chunk on its own; measuring from `start` would report the
    # cumulative time under the "Chunk i took" label.
    chunk_start = time.time()
    df_chunk = ligands_df.iloc[positions]

    if USE_EMBEDDINGS:
        print("Generating mol2vec embeddings...")
        mol_sentences, scores = get_embeddings(ligands_df=df_chunk, model=word2vec_model, radius=1)
        vectors = sentences2vec(sentences=mol_sentences, model=word2vec_model, unseen=UNCOMMON)
        np.save(f"{DATA_DIR}/{RECEPTOR}_embeddings_{i}.npy", vectors)
        np.save(f"{DATA_DIR}/{RECEPTOR}_embedding_scores_{i}.npy", np.array(scores))
    else:
        print("Generating Morgan Fingerprints...")
        fingerprint_matrix, scores = get_fingerprints(ligands_df=df_chunk)
        sparse.save_npz(f"{DATA_DIR}/{RECEPTOR}_fingerprints_{i}.npz", fingerprint_matrix)
        np.save(f"{DATA_DIR}/{RECEPTOR}_scores_{i}.npy", np.array(scores))
    print(f"Chunk {i} took: {(time.time() - chunk_start)/60} mins")

print(f"Dataset took: {(time.time() - start)/60} mins")

Generating Morgan Fingerprints...


[2m[36m(pid=7876)[0m INFO:root:0
[2m[36m(pid=7932)[0m INFO:root:10000
[2m[36m(pid=7937)[0m INFO:root:20000
[2m[36m(pid=7892)[0m INFO:root:30000
[2m[36m(pid=7885)[0m INFO:root:40000
[2m[36m(pid=7922)[0m INFO:root:50000
[2m[36m(pid=7921)[0m INFO:root:60000
[2m[36m(pid=7889)[0m INFO:root:70000
[2m[36m(pid=7894)[0m INFO:root:80000
[2m[36m(pid=7923)[0m INFO:root:90000
[2m[36m(pid=7886)[0m INFO:root:100000
[2m[36m(pid=7884)[0m INFO:root:110000
[2m[36m(pid=7894)[0m INFO:root:120000
[2m[36m(pid=7929)[0m INFO:root:130000
[2m[36m(pid=7940)[0m INFO:root:140000
[2m[36m(pid=7891)[0m INFO:root:150000
[2m[36m(pid=7934)[0m INFO:root:160000
[2m[36m(pid=7886)[0m INFO:root:170000
[2m[36m(pid=7929)[0m INFO:root:180000
[2m[36m(pid=7875)[0m INFO:root:190000
[2m[36m(pid=7934)[0m INFO:root:200000
[2m[36m(pid=7893)[0m INFO:root:210000
[2m[36m(pid=7929)[0m INFO:root:220000
[2m[36m(pid=7925)[0m INFO:root:230000
[2m[36m(pid=7921)[0m INFO:r

[2m[36m(pid=7894)[0m INFO:root:1980000
[2m[36m(pid=7890)[0m INFO:root:1990000
[2m[36m(pid=7919)[0m INFO:root:2000000
[2m[36m(pid=7895)[0m INFO:root:2010000
[2m[36m(pid=7878)[0m INFO:root:2020000
[2m[36m(pid=7882)[0m INFO:root:2030000
[2m[36m(pid=7879)[0m INFO:root:2040000
[2m[36m(pid=7893)[0m INFO:root:2050000
[2m[36m(pid=7929)[0m INFO:root:2060000
[2m[36m(pid=7885)[0m INFO:root:2070000
[2m[36m(pid=7878)[0m INFO:root:2080000
[2m[36m(pid=7888)[0m INFO:root:2090000
[2m[36m(pid=7888)[0m INFO:root:2100000
