## Notebook otimizado para ser executado na plataforma https://colab.research.google.com/notebooks/intro.ipynb#recent=true

# Instalações

In [None]:
!apt-get update
!apt-get install python3.7

In [None]:
!which python # should return /usr/local/bin/python

In [None]:
!python --version

In [None]:
!echo $PYTHONPATH

In [None]:
%env PYTHONPATH=

In [None]:
%%bash
MINICONDA_INSTALLER_SCRIPT=Miniconda3-4.5.12-Linux-x86_64.sh
MINICONDA_PREFIX=/usr/local
wget https://repo.continuum.io/miniconda/$MINICONDA_INSTALLER_SCRIPT
chmod +x $MINICONDA_INSTALLER_SCRIPT
./$MINICONDA_INSTALLER_SCRIPT -b -f -p $MINICONDA_PREFIX

In [None]:
!which conda # should return /usr/local/bin/conda

In [None]:
!conda --version # should return 4.5.4

In [None]:
!which python # still returns /usr/local/bin/python

In [None]:
!python --version # now returns Python 3.6.5 :: Anaconda, Inc.

In [None]:
%%bash
conda install --channel defaults conda python=3.7 --yes
conda update --channel defaults --all --yes

In [None]:
!conda --version # now returns 4.8.3

In [None]:
!python --version # now returns Python 3.6.10 :: Anaconda, Inc.


In [None]:
import sys
sys.path
['',  
 '/env/python',
 '/usr/lib/python37.zip',
 '/usr/lib/python3.7',
 '/usr/lib/python3.7/lib-dynload',
 '/usr/local/lib/python3.7/dist-packages', # pre-installed packages
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.7/dist-packages/IPython/extensions',
 '/root/.ipython']

In [None]:
!ls /usr/local/lib/python3.7/dist-packages


In [16]:
import sys
_ = (sys.path
        .append("/usr/local/lib/python3.7/site-packages"))

In [None]:
!conda install --channel conda-forge featuretools --yes

In [None]:
!conda install -c rdkit rdkit

!conda install -c conda-forge scikit-learn 
!conda install -c conda-forge matplotlib

##Deep Learning part
##install keras
!conda install -c conda-forge keras

# Introdução

In [19]:
from rdkit import Chem
from rdkit import rdBase
rdBase.DisableLog('rdApp.error')

from IPython.display import SVG
from  IPython.display import Image as displayImage
import io
from PIL import Image
import numpy as np
import pandas as pd
import random

random.seed(1)
np.random.seed(1)

SMILES is a notation language for representing molecules as sequences of characters.

In [20]:
SMILES = "ClCCCO"
SMILES

'ClCCCO'

canonização de um smiles. Canonicalization is a process for converting data that has more than one possible representation into a "standard", "normal", or canonical form. 

In [21]:
Chem.MolFromSmiles( "ClCCCO" )

<rdkit.Chem.rdchem.Mol at 0x7efc51c67670>

### Converter smiles para um objeto mol

In [22]:
mol = Chem.MolFromSmiles( "ClCCCO" )


### Converter o objeto mol de volta para um smile

In [23]:
Chem.MolToSmiles( mol, canonical=True )

'OCCCCl'

# Propriedades
É preciso converter o smiles para o objeto mol. Uma vez que as funções sobre as propriedades trabalham sobre o objeto mol

In [24]:
from rdkit.Chem import Descriptors

### Peso molecular

In [25]:
molWeight = Descriptors.ExactMolWt( mol )
print("Molecular Weight:\t", molWeight)

Molecular Weight:	 94.018542524


### Solubilidade

In [26]:
logP = Descriptors.MolLogP( mol )
print("Solubility:\t\t", logP)

Solubility:		 0.6075999999999999


### Area de superficies polar

In [27]:
tpsa = Descriptors.TPSA( mol )
print("Polar Surface Area:\t", tpsa)

Polar Surface Area:	 20.23


# Manipulações no dataset train.csv - Exemplo

## Carregar o dataset e aproveitar apenas a coluna SMILES - Exemplo

Necessário carregar o dataset "train.csv" para o interior da pasta datasets

In [33]:
molecules = pd.read_csv("data/train.csv")
del molecules["SPLIT"]

In [34]:
molecules.head()

Unnamed: 0,SMILES
0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1
2,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO
3,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C
4,CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O


In [35]:
molecules.size

1584663

Tem 1.6 milhoes de moleculas. Reduzir o dataset devido a problemas de RAM

### Redução do dataset

In [36]:
molecules_reduce = molecules.sample(5000)

In [37]:
molecules_reduce.size

5000

### Para cada SMILES adicionar o correspondente objeto mol


In [38]:
molecules_reduce['mols'] = molecules_reduce['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))

In [39]:
molecules_reduce.head()

Unnamed: 0,SMILES,mols
86890,Cc1cccn2cc(CCNS(=O)(=O)c3cccc(F)c3)nc12,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7490>
947484,CC(c1ccc(NC(=O)NCCC2CCCO2)cc1)N1CCOCC1,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7ad0>
923280,O=C1CCC(C(=O)N2CCCC2c2nc3ccccc3s2)N1,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7530>
1550856,Cn1cccc1C1COCCN1C(=O)c1ccc2c(c1)nnn2C,<rdkit.Chem.rdchem.Mol object at 0x7efc519b7da0>
1144022,CCCNc1nnc(-c2c3c(nn(C)c2=O)CCC3)s1,<rdkit.Chem.rdchem.Mol object at 0x7efc519b7a30>


### Para cada SMILES adicionar o correspondente peso molecular

In [40]:
molecules_reduce['molWt'] = molecules_reduce['mols'].apply(lambda x: Descriptors.MolLogP(x))

In [41]:
molecules_reduce.head()

Unnamed: 0,SMILES,mols,molWt
86890,Cc1cccn2cc(CCNS(=O)(=O)c3cccc(F)c3)nc12,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7490>,2.30282
947484,CC(c1ccc(NC(=O)NCCC2CCCO2)cc1)N1CCOCC1,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7ad0>,2.7704
923280,O=C1CCC(C(=O)N2CCCC2c2nc3ccccc3s2)N1,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7530>,2.2384
1550856,Cn1cccc1C1COCCN1C(=O)c1ccc2c(c1)nnn2C,<rdkit.Chem.rdchem.Mol object at 0x7efc519b7da0>,1.5205
1144022,CCCNc1nnc(-c2c3c(nn(C)c2=O)CCC3)s1,<rdkit.Chem.rdchem.Mol object at 0x7efc519b7a30>,1.6094


### Para cada SMILES adicionar a correspondente Solubilidade



In [42]:
molecules_reduce['Solubility'] = molecules_reduce['mols'].apply(lambda x: Descriptors.MolLogP(x))

In [43]:
molecules_reduce.head()

Unnamed: 0,SMILES,mols,molWt,Solubility
86890,Cc1cccn2cc(CCNS(=O)(=O)c3cccc(F)c3)nc12,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7490>,2.30282,2.30282
947484,CC(c1ccc(NC(=O)NCCC2CCCO2)cc1)N1CCOCC1,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7ad0>,2.7704,2.7704
923280,O=C1CCC(C(=O)N2CCCC2c2nc3ccccc3s2)N1,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7530>,2.2384,2.2384
1550856,Cn1cccc1C1COCCN1C(=O)c1ccc2c(c1)nnn2C,<rdkit.Chem.rdchem.Mol object at 0x7efc519b7da0>,1.5205,1.5205
1144022,CCCNc1nnc(-c2c3c(nn(C)c2=O)CCC3)s1,<rdkit.Chem.rdchem.Mol object at 0x7efc519b7a30>,1.6094,1.6094


### Para cada SMILES adicionar a correspondente Area de superficies polar

In [44]:
molecules_reduce['PolarSurfaceArea'] = molecules_reduce['mols'].apply(lambda x: Descriptors.TPSA(x))

In [45]:
molecules_reduce.head()

Unnamed: 0,SMILES,mols,molWt,Solubility,PolarSurfaceArea
86890,Cc1cccn2cc(CCNS(=O)(=O)c3cccc(F)c3)nc12,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7490>,2.30282,2.30282,63.47
947484,CC(c1ccc(NC(=O)NCCC2CCCO2)cc1)N1CCOCC1,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7ad0>,2.7704,2.7704,62.83
923280,O=C1CCC(C(=O)N2CCCC2c2nc3ccccc3s2)N1,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7530>,2.2384,2.2384,62.3
1550856,Cn1cccc1C1COCCN1C(=O)c1ccc2c(c1)nnn2C,<rdkit.Chem.rdchem.Mol object at 0x7efc519b7da0>,1.5205,1.5205,65.18
1144022,CCCNc1nnc(-c2c3c(nn(C)c2=O)CCC3)s1,<rdkit.Chem.rdchem.Mol object at 0x7efc519b7a30>,1.6094,1.6094,72.7


### dataset reduzido com todas as propriedades

In [46]:
molecules_reduce.head()

Unnamed: 0,SMILES,mols,molWt,Solubility,PolarSurfaceArea
86890,Cc1cccn2cc(CCNS(=O)(=O)c3cccc(F)c3)nc12,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7490>,2.30282,2.30282,63.47
947484,CC(c1ccc(NC(=O)NCCC2CCCO2)cc1)N1CCOCC1,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7ad0>,2.7704,2.7704,62.83
923280,O=C1CCC(C(=O)N2CCCC2c2nc3ccccc3s2)N1,<rdkit.Chem.rdchem.Mol object at 0x7efc519a7530>,2.2384,2.2384,62.3
1550856,Cn1cccc1C1COCCN1C(=O)c1ccc2c(c1)nnn2C,<rdkit.Chem.rdchem.Mol object at 0x7efc519b7da0>,1.5205,1.5205,65.18
1144022,CCCNc1nnc(-c2c3c(nn(C)c2=O)CCC3)s1,<rdkit.Chem.rdchem.Mol object at 0x7efc519b7a30>,1.6094,1.6094,72.7


Neste último dataset não foi possível calcular as propriedades das moléculas para todos os exemplos de treino, uma vez que o carregamento de mais de 1 milhão de moléculas comprometia a memória RAM. Na próxima secção encontra-se o código que consegue resolver o problema da memória RAM.

# Construção do dataset train_final.csv com todos os dados do dataset de treino e com as conlunas referentes às propriedades dos SMILES supracitadas

Construção de um dataframe cujas colunas são as propriedades das moléculas: peso molecular, solubilidade e área polar

In [47]:
molecules_final1 = pd.DataFrame(columns=['SMILES','molWt','Solubility','PolarSurfaceArea']) 
molecules_final1
molecules_final1.to_csv("data/train_final.csv")

Contrução do ficheiro train_final.csv, cujos exemplos são os SMILES dos dados de treino e as colunas o calculo das respetivas propriedades

In [48]:
chunksize = 10000
filename = "data/train.csv"
for chunk in pd.read_csv(filename, chunksize=chunksize):
    del chunk["SPLIT"]
    chunk['mols'] = chunk['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
    chunk['molWt'] = chunk['mols'].apply(lambda x: Descriptors.MolLogP(x))
    chunk['Solubility'] = chunk['mols'].apply(lambda x: Descriptors.MolLogP(x))
    chunk['PolarSurfaceArea'] = chunk['mols'].apply(lambda x: Descriptors.TPSA(x))
    del chunk["mols"]
    chunk.to_csv("data/train_final.csv", mode='a', header=False)

Verificação do número de linhas dos datasets. Se no processo não houve erros, número de exemplos são iguais

In [51]:
pd.read_csv("data/train_final.csv")["SMILES"].size

1584663

In [52]:
pd.read_csv("data/train.csv")["SMILES"].size

1584663

Dataset resultante pode ser obtido com o seguinte código:

In [None]:
!pip install gdown
!gdown 'https://drive.google.com/file/d/1wlnBI7-zyT21ib0ui44XwOYwNh7pkexi/view?usp=sharing'