In [1]:
# Mount Google Drive at /content/drive so the CSV and model files read by
# later cells are reachable.  Colab-only: the call prompts for an OAuth
# authorization code before the mount completes.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO


# Module-level logger used by install() to report progress.
logger = getLogger(__name__)
# getLogger returns the same object every time this cell is executed, so an
# unconditional addHandler would stack one more StreamHandler per re-run and
# print every message several times.  Attach the handler only once.
if not logger.handlers:
    logger.addHandler(StreamHandler())
logger.setLevel(INFO)


def install(
        chunk_size=4096,
        file_name="Miniconda3-latest-Linux-x86_64.sh",
        url_base="https://repo.continuum.io/miniconda/",
        conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
        rdkit_version=None,
        add_python_path=True,
        force=False):
    """Install RDKit through a freshly downloaded Miniconda.

    The Miniconda installer is downloaded from ``url_base + file_name``,
    run in batch mode with ``conda_path`` as prefix, and ``conda install``
    is then used to install rdkit from the ``rdkit`` channel together with
    a Python matching the running interpreter's ``major.minor.micro``.

    Usage::

        import rdkit_installer
        rdkit_installer.install()

    Args:
        chunk_size: number of bytes per chunk while streaming the installer
            to disk.
        file_name: installer file name; appended to ``url_base``.
        url_base: base URL the installer is fetched from.
        conda_path: installation prefix.  An existing directory or file at
            this path is deleted before installing.
        rdkit_version: exact rdkit version to request, or ``None`` for
            whatever conda resolves.
        add_python_path: when true, append the prefix's ``site-packages``
            directory to ``sys.path`` (if not already present).
        force: reinstall even if an ``rdkit`` directory already exists in
            the prefix's ``site-packages``.

    Returns:
        None.

    Raises:
        requests.HTTPError: if the installer download returns an error
            status.
        subprocess.CalledProcessError: if the Miniconda installer or
            ``conda install`` exits with a non-zero status.
        ImportError: if rdkit cannot be imported afterwards although
            ``add_python_path`` is true.
    """
    # Local imports keep the block self-contained; both are standard library.
    import importlib
    import tempfile

    python_path = os.path.join(
        conda_path,
        "lib",
        "python{0}.{1}".format(*sys.version_info),
        "site-packages",
    )

    if add_python_path and python_path not in sys.path:
        logger.info("add {} to PYTHONPATH".format(python_path))
        sys.path.append(python_path)

    if os.path.isdir(os.path.join(python_path, "rdkit")):
        logger.info("rdkit is already installed")
        if not force:
            return

        logger.info("force re-install")

    url = url_base + file_name
    python_version = "{0}.{1}.{2}".format(*sys.version_info)

    logger.info("python version: {}".format(python_version))

    # Start from a clean prefix: the installer is run against conda_path.
    if os.path.isdir(conda_path):
        logger.warning("remove current miniconda")
        shutil.rmtree(conda_path)
    elif os.path.isfile(conda_path):
        logger.warning("remove {}".format(conda_path))
        os.remove(conda_path)

    # Stage the installer in a private temporary directory so that neither a
    # complete nor a partially downloaded script is left behind in the
    # current working directory.
    work_dir = tempfile.mkdtemp(prefix="rdkit_installer_")
    try:
        installer_path = os.path.join(work_dir, os.path.basename(file_name))

        logger.info('fetching installer from {}'.format(url))
        res = requests.get(url, stream=True)
        try:
            res.raise_for_status()
            with open(installer_path, 'wb') as f:
                for chunk in res.iter_content(chunk_size):
                    f.write(chunk)
        finally:
            # A streamed response keeps its connection open until closed.
            res.close()
        logger.info('done')

        logger.info('installing miniconda to {}'.format(conda_path))
        subprocess.check_call(
            ["bash", installer_path, "-b", "-p", conda_path])
        logger.info('done')
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)

    logger.info("installing rdkit")
    subprocess.check_call([
        os.path.join(conda_path, "bin", "conda"),
        "install",
        "--yes",
        "-c", "rdkit",
        "python=={}".format(python_version),
        "rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)])
    logger.info("done")

    # site-packages was created after it was put on sys.path, so make sure
    # the import system does not rely on a stale view of that directory.
    importlib.invalidate_caches()
    try:
        import rdkit
    except ImportError:
        if add_python_path:
            raise
        # The caller opted out of the sys.path change, so a failed import
        # here does not mean the installation failed.
        logger.info(
            "rdkit installation finished! add {} to sys.path to import it"
            .format(python_path))
        return
    logger.info("rdkit-{} installation finished!".format(rdkit.__version__))


# Run the installation with default arguments when this code is executed
# directly; inside the notebook the guard is true, so this cell performs the
# install (see the log output below).  Importing the code as a module skips it.
if __name__ == "__main__":
    install()

add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH
python version: 3.6.9
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
done
installing miniconda to /root/miniconda
done
installing rdkit
done
rdkit-2020.03.1 installation finished!


In [3]:
# Install mol2vec straight from its GitHub repository.
# NOTE(review): no tag or commit is pinned, so a later run may build a
# different revision than the one these outputs were produced with.
!pip install git+https://github.com/samoturk/mol2vec;

Collecting git+https://github.com/samoturk/mol2vec
  Cloning https://github.com/samoturk/mol2vec to /tmp/pip-req-build-rsxv5gq1
  Running command git clone -q https://github.com/samoturk/mol2vec /tmp/pip-req-build-rsxv5gq1
Building wheels for collected packages: mol2vec
  Building wheel for mol2vec (setup.py) ... done
  Created wheel for mol2vec: filename=mol2vec-0.1-cp36-none-any.whl size=14026 sha256=604fee6539b72a61094bd38253dde08c8bcd2351da4f21a070a6ce67461639b3
  Stored in directory: /tmp/pip-ephem-wheel-cache-czm2ocon/wheels/96/0f/2d/a1092b9677c96453dc244b209544cac61bc8b974cbffb50063
Successfully built mol2vec
Installing collected packages: mol2vec
Successfully installed mol2vec-0.1


In [0]:
import numpy as np
import pandas as pd

In [0]:
# Load the training set from the mounted Drive folder; per the preview below
# it has a 'SMILES sequence' column and a 'Binding Affinity' column.
train_data =  pd.read_csv('/content/drive/My Drive/smai_assign4/q3/train.csv')

In [6]:
# Preview the first rows of the raw training frame.
train_data.head()

Unnamed: 0,SMILES sequence,Binding Affinity
0,CCNC(C)C(NC)c1ccccc1,-18.0861
1,CONC(=O)c1cncnc1,-17.5783
2,CCNC1CCCN(Cc2ccsc2)C1,-20.3645
3,CC(NC(=O)CSCCN)c1ccccc1,-19.3144
4,CCC(CS)CN(C)c1ccccc1,-15.8451


In [7]:
# Separate the regression target from the features.  `pop` removes the
# column from train_data in place and returns it as a Series, which is what
# the original select-then-drop pair did.  The membership guard keeps the
# cell re-runnable: without it a second execution raises KeyError because
# the column has already been removed.
if 'Binding Affinity' in train_data.columns:
    train_aff = train_data.pop('Binding Affinity')
print(train_aff.shape)
print(train_data.shape)

(9000,)
(9000, 1)


In [0]:
from rdkit import Chem 

In [9]:
# Parse every SMILES string into an RDKit molecule object.
train_data['mol'] = train_data['SMILES sequence'].apply(Chem.MolFromSmiles)
# MolFromSmiles signals an unparseable string by returning None instead of
# raising; fail here with a clear message rather than deep inside the later
# featurisation cells.
assert train_data['mol'].notna().all(), \
    "some training SMILES could not be parsed by RDKit"
print(type(train_data['mol'][0]))

<class 'rdkit.Chem.rdchem.Mol'>


In [10]:
# Show the frame's dimensions followed by its first five rows (plain text).
print(train_data.shape, train_data.head(5), sep='\n')

(9000, 2)
           SMILES sequence                                               mol
0     CCNC(C)C(NC)c1ccccc1  <rdkit.Chem.rdchem.Mol object at 0x7f4b43995e90>
1         CONC(=O)c1cncnc1  <rdkit.Chem.rdchem.Mol object at 0x7f4b43995ee0>
2    CCNC1CCCN(Cc2ccsc2)C1  <rdkit.Chem.rdchem.Mol object at 0x7f4b43995f30>
3  CC(NC(=O)CSCCN)c1ccccc1  <rdkit.Chem.rdchem.Mol object at 0x7f4b43995f80>
4     CCC(CS)CN(C)c1ccccc1  <rdkit.Chem.rdchem.Mol object at 0x7f4b439a3030>


In [11]:
# Read the test molecules and discard the 'Binding Affinity' column in a
# single chained expression, leaving only the SMILES strings.
test_data = (
    pd.read_csv('/content/drive/My Drive/smai_assign4/q3/test.csv')
    .drop(columns='Binding Affinity')
)
print(test_data.shape)
print(test_data.head(5))

(2500, 1)
        SMILES sequence
0  Cc1ccc(C2CNCCN2C)cc1
1      CCOC(CO)c1ccccc1
2     CC(=O)Nc1cnn(C)n1
3      CCC(C)NCc1ncccn1
4       CC(C)=C1CC(N)C1


In [12]:
# Load the pre-trained 300-dimensional mol2vec embedding (a gensim Word2Vec
# model) from Drive.
# NOTE(review): the file is a .pkl and gensim's load presumably unpickles it;
# only load model files that come from a trusted source.
from gensim.models import word2vec
model = word2vec.Word2Vec.load('/content/drive/My Drive/smai_assign4/q3/model_300dim.pkl')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec

In [14]:
# Walk one molecule (row 1) through the featurisation pipeline step by step.
# The sentence is computed once and reused instead of being rebuilt for each
# print statement.
example_sentence = mol2alt_sentence(train_data['mol'][1], radius=1)
print('Molecular sentence:', example_sentence)
print('\nMolSentence object:', MolSentence(example_sentence))
# NOTE(review): sentences2vec is given a single MolSentence here rather than a
# list of sentences, and the result has one row per word (22, 300) - confirm
# that this is the intended demonstration.
example_vec = sentences2vec(MolSentence(example_sentence), model, unseen='UNK')
print('\nDfVec object:', DfVec(example_vec))

Molecular sentence: ['2246728737', '3975275337', '864674487', '903112553', '847961216', '2204949651', '2246699815', '1054767590', '864942730', '1510328189', '3217380708', '2994748777', '3218693969', '3777168895', '2041434490', '3118255683', '3218693969', '725322217', '2041434490', '3118255683', '3218693969', '3777168895']

MolSentence object: MolSentence with 22 words

DfVec object: (22, 300) dimensional vector


In [0]:
# Turn each molecule into a MolSentence of radius-1 substructure identifiers.
# Only the 'mol' column is needed, so map over that Series directly.
train_data['sentence'] = train_data['mol'].apply(
    lambda mol: MolSentence(mol2alt_sentence(mol, 1))
)

In [16]:
# Embed every training sentence with the loaded model (identifiers missing
# from the vocabulary fall back to the 'UNK' vector) and wrap each in DfVec.
train_data['mol2vec'] = list(
    map(DfVec, sentences2vec(train_data['sentence'], model, unseen='UNK'))
)

# Feature matrix (one row per molecule) and target vector for the regressor.
X = np.array([dfvec.vec for dfvec in train_data['mol2vec']])
y = train_aff.values

print(X.shape, len(y), sep='\n')

(9000, 300)
9000


In [0]:
from sklearn import svm

In [18]:
# Parse every test SMILES string into an RDKit molecule object.
test_data['mol'] = test_data['SMILES sequence'].apply(Chem.MolFromSmiles)
# MolFromSmiles signals an unparseable string by returning None instead of
# raising; fail here with a clear message rather than deep inside the later
# featurisation cells.
assert test_data['mol'].notna().all(), \
    "some test SMILES could not be parsed by RDKit"
print(type(test_data['mol'][0]))

<class 'rdkit.Chem.rdchem.Mol'>


In [0]:
# Turn each test molecule into a MolSentence of radius-1 substructure
# identifiers, using the same settings as for the training data.
test_data['sentence'] = test_data['mol'].apply(
    lambda mol: MolSentence(mol2alt_sentence(mol, 1))
)

In [0]:
# Embed the test sentences with the same model and 'UNK' fallback, then
# stack the vectors into the feature matrix passed to predict().
test_data['mol2vec'] = list(
    map(DfVec, sentences2vec(test_data['sentence'], model, unseen='UNK'))
)
test = np.array([dfvec.vec for dfvec in test_data['mol2vec']])

In [21]:
# Fit a support-vector regressor on the mol2vec features.  Only C and
# epsilon are set explicitly; everything else stays at the library defaults
# shown in the repr below (RBF kernel, gamma='scale').
clf = svm.SVR(
    C=55.0,
    epsilon=2,
)
clf.fit(X, y)


SVR(C=55.0, cache_size=200, coef0=0.0, degree=3, epsilon=2, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [0]:
# Predict a binding affinity for every row of the test feature matrix.
y_pred = clf.predict(test)

In [23]:
# Print the test SMILES strings and, below them, the predicted affinities.
col = np.array(test_data['SMILES sequence'])
print(col, y_pred, sep='\n')


['Cc1ccc(C2CNCCN2C)cc1' 'CCOC(CO)c1ccccc1' 'CC(=O)Nc1cnn(C)n1' ...
 'NC1=NC(=O)C(=CC(=O)O)S1' 'C=C(CC)CC(C)C(O)c1ccccc1' 'N#CCC(CN)c1cncnc1']
[-21.28885103 -14.35681914 -23.25483643 ... -23.00149889 -15.90468471
 -21.65056617]


In [26]:
# Display the prediction array once more as the cell's rich output.
y_pred

array([-21.28885103, -14.35681914, -23.25483643, ..., -23.00149889,
       -15.90468471, -21.65056617])