In [37]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
import ray
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from scipy import sparse
import logging
from mol2vec.features import mol2alt_sentence
from gensim.models.word2vec import Word2Vec

In [18]:
ray.init()

2021-05-26 22:38:18,136	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m


{'node_ip_address': '172.31.34.223',
 'raylet_ip_address': '172.31.34.223',
 'redis_address': '172.31.34.223:20179',
 'object_store_address': '/tmp/ray/session_2021-05-26_22-38-15_934524_25070/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-26_22-38-15_934524_25070/sockets/raylet',
 'webui_url': '127.0.0.1:8266',
 'session_dir': '/tmp/ray/session_2021-05-26_22-38-15_934524_25070',
 'metrics_export_port': 54370,
 'node_id': 'e0e172425193445f1a039e5786930123a8924d351c32ba927c5f8df1'}

In [4]:
RECEPTOR = "EnamineHTS"
INPUT_DATA_DIR = "/mnt/efs/enamine"
INPUT_DATA = f"{INPUT_DATA_DIR}/{RECEPTOR}_scores.csv"
# INPUT_SMILES = f"{INPUT_DATA_DIR}/{RECEPTOR}.smi"
# OUTPUT_EMBEDDINGS = f"{INPUT_DATA_DIR}/{RECEPTOR}_embeddings.csv"
OUTPUT_DATA_DIR = "../processed_data"

In [5]:
def get_data():
    ligands_df = pd.read_csv(INPUT_DATA)
    
    return ligands_df

In [19]:
word2vec_model = Word2Vec.load("/home/ubuntu/mol2vec/examples/models/model_300dim.pkl")
word2vec_model.wv.init_sims()

  word2vec_model.wv.init_sims()


In [23]:
UNCOMMON = "UNK"
# word2vec_model.wv[UNCOMMON]

# Embed molecules

In [25]:
# mol2vec featurize -i "/mnt/efs/enamine/EnamineHTS.smi" -o "/mnt/efs/enamine/EnamineHTS_embeddings.csv" -m examples/models/model_300dim.pkl -r 1 --uncommon UNK
# features.featurize(in_file=INPUT_SMILES, 
#                    out_file=OUTPUT_EMBEDDINGS, 
#                    model_path="/home/ubuntu/mol2vec/examples/models/model_300dim.pkl", 
#                    r=1, 
#                    uncommon="UNK")

In [41]:
def sentences2vec(sentences, model, unseen=None):
    """Generate vectors for each sentence (list) in a list of sentences. Vector is simply a
    sum of vectors for individual words.
    
    Parameters
    ----------
    sentences : list, array
        List with sentences
    model : word2vec.Word2Vec
        Gensim word2vec model
    unseen : None, str
        Keyword for unseen words. If None, those words are skipped.
        https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

    Returns
    -------
    np.array
    """
    keys = set(model.wv.key_to_index)
    vec = []
    
    if unseen:
        unseen_vec = model.wv.get_vector(unseen)

    for sentence in sentences:
        if unseen:
            vec.append(sum([model.wv.get_vector(y) if y in set(sentence) & keys
                       else unseen_vec for y in sentence]))
        else:
            vec.append(sum([model.wv.get_vector(y) for y in sentence 
                            if y in set(sentence) & keys]))
    return np.array(vec, dtype=np.float32)

In [33]:
@ray.remote
def create_mol_sentence(smiles, score, r, i):
    if i % 10000 == 0:
        logging.basicConfig(level=logging.INFO)
        logging.info(i)
        
    mol = Chem.MolFromSmiles(smiles)
    # smiles = Chem.MolToSmiles(mol)
    
    if not mol:
        return
    
    alt_sentence = mol2alt_sentence(mol, radius=r)
    
    return alt_sentence, score

In [34]:
def get_embeddings(ligands_df, model, radius=1):
    future_values = [create_mol_sentence.remote(smiles=smiles, score=score, r=radius, i=i) for (i, (smiles, score)) in enumerate(zip(ligands_df["smiles"], ligands_df["score"]))]
    
    values = [v for v in ray.get(future_values) if v]
    mol_sentences, scores = zip(*values)

#     vectors = sentences2vec(sentences=mol_sentences, model=model, unseen=UNCOMMON)

    return mol_sentences, scores

In [35]:
ligands_df = get_data()

In [36]:
embeddings, scores = get_embeddings(ligands_df=ligands_df, model=word2vec_model, radius=1)

[2m[36m(pid=25786)[0m INFO:root:0
[2m[36m(pid=25785)[0m INFO:root:10000
[2m[36m(pid=25785)[0m INFO:root:20000
[2m[36m(pid=25777)[0m INFO:root:30000
[2m[36m(pid=25776)[0m INFO:root:40000
[2m[36m(pid=25781)[0m INFO:root:50000
[2m[36m(pid=25784)[0m INFO:root:60000
[2m[36m(pid=25780)[0m INFO:root:70000
[2m[36m(pid=25776)[0m INFO:root:80000
[2m[36m(pid=25787)[0m INFO:root:90000
[2m[36m(pid=25783)[0m INFO:root:100000
[2m[36m(pid=25775)[0m INFO:root:110000
[2m[36m(pid=25785)[0m INFO:root:120000
[2m[36m(pid=25784)[0m INFO:root:130000
[2m[36m(pid=25784)[0m INFO:root:140000
[2m[36m(pid=25786)[0m INFO:root:150000
[2m[36m(pid=25779)[0m INFO:root:160000
[2m[36m(pid=25781)[0m INFO:root:170000
[2m[36m(pid=25781)[0m INFO:root:180000
[2m[36m(pid=25783)[0m INFO:root:190000
[2m[36m(pid=25780)[0m INFO:root:200000
[2m[36m(pid=25778)[0m INFO:root:210000
[2m[36m(pid=25784)[0m INFO:root:220000
[2m[36m(pid=25778)[0m INFO:root:230000
[2m[

[2m[36m(pid=25780)[0m INFO:root:1940000
[2m[36m(pid=25775)[0m INFO:root:1950000
[2m[36m(pid=25789)[0m INFO:root:1960000
[2m[36m(pid=25780)[0m INFO:root:1970000
[2m[36m(pid=25788)[0m INFO:root:1980000
[2m[36m(pid=25785)[0m INFO:root:1990000
[2m[36m(pid=25778)[0m INFO:root:2000000
[2m[36m(pid=25782)[0m INFO:root:2010000
[2m[36m(pid=25787)[0m INFO:root:2020000
[2m[36m(pid=25781)[0m INFO:root:2030000
[2m[36m(pid=25783)[0m INFO:root:2040000
[2m[36m(pid=25787)[0m INFO:root:2050000
[2m[36m(pid=25777)[0m INFO:root:2060000
[2m[36m(pid=25784)[0m INFO:root:2070000
[2m[36m(pid=25776)[0m INFO:root:2080000
[2m[36m(pid=25789)[0m INFO:root:2090000
[2m[36m(pid=25774)[0m INFO:root:2100000


In [39]:
vectors = sentences2vec(sentences=embeddings, model=word2vec_model, unseen=UNCOMMON)

In [42]:
np.save(f"{INPUT_DATA_DIR}/{RECEPTOR}_embeddings.npy", vectors)
np.save(f"{INPUT_DATA_DIR}/{RECEPTOR}_embedding_scores.npy", np.array(scores))