In [35]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
import ray
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from scipy import sparse
import logging
from functools import lru_cache
from mol2vec.features import mol2alt_sentence, sentences2vec
from gensim.models.word2vec import Word2Vec

In [2]:
NUM_CPUS = 16

In [3]:
ray.init(num_cpus=NUM_CPUS)

2021-05-28 12:51:11,131	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '172.31.43.114',
 'raylet_ip_address': '172.31.43.114',
 'redis_address': '172.31.43.114:6379',
 'object_store_address': '/tmp/ray/session_2021-05-28_12-51-09_636659_18091/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-28_12-51-09_636659_18091/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-05-28_12-51-09_636659_18091',
 'metrics_export_port': 50505,
 'node_id': 'd62f6e682993c77e1e066cabfca6132e7815fddca5cfc24b3923489e'}

In [4]:
NUM_CHUNKS = 1

In [5]:
USE_EMBEDDINGS = False

In [6]:
RECEPTOR = "EnamineHTS"
DATA_DIR = "/mnt/efs/enamine"
INPUT_DATA = f"{DATA_DIR}/{RECEPTOR}_scores.csv"

MODEL_PATH = "/mnt/efs/mol2vec/examples/models/model_300dim.pkl"
UNCOMMON = "UNK"

In [7]:
def get_data():
    ligands_df = pd.read_csv(INPUT_DATA)
    
    return ligands_df

In [8]:
@lru_cache(maxsize=2)
def get_w2v_model():
    word2vec_model = Word2Vec.load(MODEL_PATH)
    word2vec_model.wv.init_sims()
    return word2vec_model

In [9]:
def create_fingerprint(smiles, score, i, radius=2, n_bits=8192):
    if i % 10000 == 0:
        logging.basicConfig(level=logging.INFO)
        logging.info(i)
        
    mol = Chem.MolFromSmiles(smiles)
    pars = { 
        "radius": radius,
        "nBits": n_bits,
        "invariants": [],
        "fromAtoms": [],
        "useChirality": False,
        "useBondTypes": True,
        "useFeatures": True,
    }
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **pars)
    onbits = list(fp.GetOnBits())
    
    # using radius of 1
    alt_sentence = mol2alt_sentence(mol, radius=1)

    return onbits, alt_sentence, float(score)

In [10]:
@ray.remote
def create_mol_sentence(smiles, score, i, radius=1):
    if i % 10000 == 0:
        logging.basicConfig(level=logging.INFO)
        logging.info(i)
        
    mol = Chem.MolFromSmiles(smiles)
    # smiles = Chem.MolToSmiles(mol)
    
    alt_sentence = mol2alt_sentence(mol, radius=radius)
    
    return alt_sentence, float(score)

In [11]:
@ray.remote
def create_fingerprint_batched(batches, radius=2, n_bits=8192):
    bits_list = []
    sentence_list = []
    score_list = []
    for i, batch in enumerate(batches):
        smiles, score = batch
        
        onbits, alt_sentence, score = create_fingerprint(smiles, score, i)
        
        if not USE_EMBEDDINGS:
            bits_list.append(onbits)
        sentence_list.append(alt_sentence)
        score_list.append(score)

    return bits_list, sentence_list, score_list

In [12]:
def flatten(lst):
    return [item for batch in lst for item in batch]

In [13]:
def get_fingerprints(ligands_df, fp_size=8192, smiles_col="smiles", score_col="score"):
    future_values = []
    for df_chunk in np.array_split(ligands_df, NUM_CPUS):
        future_values.append(create_fingerprint_batched.remote(zip(df_chunk[smiles_col], df_chunk[score_col])))

    values = ray.get(future_values)
    
    all_bits, alt_sentences, scores = zip(*values)
    
    alt_sentences = flatten(alt_sentences)
    scores = flatten(scores)
    
    fingerprint_matrix = None
    if not USE_EMBEDDINGS:
        all_bits = flatten(all_bits)

        row_idx = []
        col_idx = []
        for i, bits in enumerate(all_bits):
            # these bits all have the same row:
            row_idx += [i] * len(bits)
            #and the column indices of those bits:
            col_idx += bits

        # generate a sparse matrix out of the row,col indices:
        fingerprint_matrix = sparse.coo_matrix((np.ones(len(row_idx)).astype(bool), 
                                               (row_idx, col_idx)), 
                                               shape=(max(row_idx)+1, fp_size))

        # convert to csr matrix, it is better:
        fingerprint_matrix =  sparse.csr_matrix(fingerprint_matrix)

    return alt_sentences, fingerprint_matrix, scores

In [14]:
def get_embeddings(ligands_df, model, radius=1):
    future_values = [create_mol_sentence.remote(smiles=smiles, score=score, r=radius, i=i) for (i, (smiles, score)) in enumerate(zip(ligands_df["smiles"], ligands_df["score"]))]
    
    values = [v for v in ray.get(future_values) if v]
    mol_sentences, scores = zip(*values)

#     vectors = sentences2vec(sentences=mol_sentences, model=model, unseen=UNCOMMON)

    return mol_sentences, scores

In [29]:
@lru_cache(maxsize=50_000)
def get_vector_cached(model, query, unseen):
    try:
        return model.wv.get_vector(query)
    except:
        return model.wv.get_vector(unseen)

In [30]:
def sentences2vec(sentences, model, unseen=None):
    keys = set(model.wv.key_to_index)
    vec = []

    for sentence in sentences:
        if unseen:
            vec.append(sum([get_vector_cached(model, query=y, unseen=unseen) for y in sentence]))
        else:
            vec.append(sum([get_vector_cached(model, query=y, unseen=unseen) for y in sentence 
                            if y in set(sentence) & keys]))
    return np.array(vec, dtype=np.float32)

In [17]:
ligands_df = get_data()

In [18]:
word2vec_model = get_w2v_model()

  word2vec_model.wv.init_sims()


In [34]:
start = time.time()
for i, df_chunk in enumerate(np.array_split(ligands_df, NUM_CHUNKS)):
    chunk_start = time.time()
#         if USE_EMBEDDINGS:
#             print("Generating mol2vec embeddings...")
#             embeddings, scores = get_embeddings(ligands_df=df_chunk, model=word2vec_model, radius=1)
#             vectors = sentences2vec(sentences=embeddings, model=word2vec_model, unseen=UNCOMMON)

#             np.save(f"{DATA_DIR}/{RECEPTOR}_embeddings_{i}.npy", vectors)
#             np.save(f"{DATA_DIR}/{RECEPTOR}_embedding_scores_{i}.npy", np.array(scores))
#         else:
    print("Generating Morgan Fingerprints...")
    embeddings, fingerprint_matrix, scores = get_fingerprints(ligands_df=df_chunk)

    if not USE_EMBEDDINGS:
        print("Saving fingerprint matrix...")
        sparse.save_npz(f"{DATA_DIR}/{RECEPTOR}_fingerprints_{i}.npz", fingerprint_matrix)

    np.save(f"{DATA_DIR}/{RECEPTOR}_scores_{i}.npy", np.array(scores))

    print("Saving embeddings...")
    vectors = sentences2vec(sentences=embeddings, model=word2vec_model, unseen=UNCOMMON)
    np.save(f"{DATA_DIR}/{RECEPTOR}_embeddings_{i}.npy", vectors)
    
    print(f"Chunk {i} took: {(time.time() - chunk_start)/60} mins")
    
print(f"Dataset took: {(time.time() - start)/60} mins")

Generating Morgan Fingerprints...


[2m[36m(pid=18246)[0m INFO:root:0
[2m[36m(pid=18238)[0m INFO:root:0
[2m[36m(pid=18247)[0m INFO:root:0
[2m[36m(pid=18242)[0m INFO:root:0
[2m[36m(pid=18249)[0m INFO:root:0
[2m[36m(pid=18250)[0m INFO:root:0
[2m[36m(pid=18237)[0m INFO:root:0
[2m[36m(pid=18240)[0m INFO:root:0
[2m[36m(pid=18245)[0m INFO:root:0
[2m[36m(pid=18244)[0m INFO:root:0
[2m[36m(pid=18248)[0m INFO:root:0
[2m[36m(pid=18243)[0m INFO:root:0
[2m[36m(pid=18241)[0m INFO:root:0
[2m[36m(pid=18239)[0m INFO:root:0
[2m[36m(pid=18236)[0m INFO:root:0
[2m[36m(pid=18235)[0m INFO:root:0
[2m[36m(pid=18236)[0m INFO:root:10000
[2m[36m(pid=18235)[0m INFO:root:10000
[2m[36m(pid=18241)[0m INFO:root:10000
[2m[36m(pid=18244)[0m INFO:root:10000
[2m[36m(pid=18243)[0m INFO:root:10000
[2m[36m(pid=18250)[0m INFO:root:10000
[2m[36m(pid=18239)[0m INFO:root:10000
[2m[36m(pid=18245)[0m INFO:root:10000
[2m[36m(pid=18242)[0m INFO:root:10000
[2m[36m(pid=18240)[0m INFO:root:1000

[2m[36m(pid=18245)[0m INFO:root:120000
[2m[36m(pid=18241)[0m INFO:root:130000
[2m[36m(pid=18239)[0m INFO:root:130000
[2m[36m(pid=18250)[0m INFO:root:120000
[2m[36m(pid=18242)[0m INFO:root:120000
[2m[36m(pid=18243)[0m INFO:root:130000
[2m[36m(pid=18248)[0m INFO:root:130000
[2m[36m(pid=18237)[0m INFO:root:120000
[2m[36m(pid=18249)[0m INFO:root:120000
[2m[36m(pid=18247)[0m INFO:root:120000
[2m[36m(pid=18246)[0m INFO:root:110000
[2m[36m(pid=18244)[0m INFO:root:130000
[2m[36m(pid=18240)[0m INFO:root:130000
[2m[36m(pid=18238)[0m INFO:root:120000
[2m[36m(pid=18245)[0m INFO:root:130000
[2m[36m(pid=18250)[0m INFO:root:130000
[2m[36m(pid=18242)[0m INFO:root:130000
[2m[36m(pid=18237)[0m INFO:root:130000
[2m[36m(pid=18249)[0m INFO:root:130000
[2m[36m(pid=18247)[0m INFO:root:130000
[2m[36m(pid=18246)[0m INFO:root:120000
[2m[36m(pid=18238)[0m INFO:root:130000
[2m[36m(pid=18246)[0m INFO:root:130000


Saving embeddings...
Chunk 0 took: 10.098886346817016 mins
Dataset took: 10.100822548071543 mins
