<a href="https://colab.research.google.com/github/SergeyAnufriev/LSTM/blob/master/LSTM_molecules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project sources (data_, model_, utils_ modules and the dataset).
# Any previous checkout is removed first so the cell can be re-run without
# `git clone` failing on an existing directory.
# NOTE(review): the clone lands in the current working directory; this assumes
# the notebook runs from /content (the Colab default) - confirm.
!rm -rf /content/LSTM
!git clone https://github.com/SergeyAnufriev/LSTM.git

Cloning into 'LSTM'...
remote: Enumerating objects: 102, done.[K
remote: Counting objects: 100% (102/102), done.[K
remote: Compressing objects: 100% (76/76), done.[K
remote: Total 102 (delta 52), reused 72 (delta 22), pack-reused 0[K
Receiving objects: 100% (102/102), 13.63 MiB | 19.58 MiB/s, done.
Resolving deltas: 100% (52/52), done.


In [None]:
%%bash
# Install Miniconda into /usr/local and use it to provide RDKit.
#
# The installer is fetched from repo.anaconda.com directly: the old
# repo.continuum.io host only answers with a 301 redirect to it.
wget -c https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
# -b: batch mode (no prompts), -f: do not fail if the prefix already exists,
# -p: installation prefix.
./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
conda config --set always_yes yes --set changeps1 no
# The "latest" installer ships a newer Python (py38 builds in the install log),
# so pin the conda Python to 3.7 to match the python3.7 site-packages directory
# that the notebook adds to sys.path.
conda install -q -y -c conda-forge python=3.7
conda install -q -y -c conda-forge rdkit==2020.09.2

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - brotlipy==0.7.0=py38h27cfd23_1003
    - ca-certificates==2020.10.14=0
    - certifi==2020.6.20=pyhd3eb1b0_3
    - cffi==1.14.3=py38h261ae71_2
    - chardet==3.0.4=py38h06a4308_1003
    - conda-package-handling==1.7.2=py38h03888b9_0
    - conda==4.9.2=py38h06a4308_0
    - cryptography==3.2.1=py38h3c74f83_1
    - idna==2.10=py_0
    - ld_impl_linux-64==2.33.1=h53a641e_7
    - libedit==3.1.20191231=h14c3975_1
    - libffi==3.3=he6710b0_2
    - libgcc-ng==9.1.0=hdf63c60_0
    - libstdcxx-ng==9.1.0=hdf63c60_0
    - ncurses==6.2=he6710b0_1
    - openssl==1.1.1h=h7b6447c_0
    - pip==20.2.4=py38h06a4308_0
    - pycosat==0.6.3=py38h7b6447c_1
    - pycparser==2.20=py_2
    - pyopenssl==19.1.0=pyhd3eb1b0_1
    - pysocks==1.7.1=py3

--2021-07-21 04:01:20--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.200.79, 104.18.201.79, 2606:4700::6812:c94f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.200.79|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh [following]
--2021-07-21 04:01:20--  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.131.3, 104.16.130.3, 2606:4700::6810:8303, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.131.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94235922 (90M) [application/x-sh]
Saving to: ‘Miniconda3-latest-Linux-x86_64.sh’

     0K .......... .......... .......... .......... ..........  0% 3.42M 26s
    50K .......... .......... .......... .......... .........

In [None]:
import sys
# Make the conda-installed packages and the cloned repository importable.
# The cell is idempotent: re-running it never leaves duplicate sys.path entries.
CONDA_SITE_PACKAGES = '/usr/local/lib/python3.7/site-packages/'
REPO_DIR = '/content/LSTM'

# Appended (lowest priority) so the kernel's own packages still win.
if CONDA_SITE_PACKAGES not in sys.path:
    sys.path.append(CONDA_SITE_PACKAGES)

# The repository must stay first on the path; drop a stale entry before
# re-inserting it at the front.
if REPO_DIR in sys.path:
    sys.path.remove(REPO_DIR)
sys.path.insert(0, REPO_DIR)

In [None]:
import torch
import numpy as np
import random
from torch.utils.data import DataLoader,random_split
import matplotlib.pyplot as plt
import time
from torch.nn.functional import softmax

'''Custom modules'''
from data_ import Dataset_
from model_ import RNN_forward
from utils_ import loss_,test_loss,sample_,model_quality


In [None]:
# Model parameters
EMBED_DIM = 100    # emb_dim passed to RNN_forward
batch_size = 64    # DataLoader batch size; also sizes the initial hidden state
n_hidden = 512     # hid_dim passed to RNN_forward
n_layers = 2       # n_layers passed to RNN_forward
drop1 = 0.3        # dropout passed to RNN_forward
drop2 = 0.5        # not referenced by any cell visible in this notebook
LR = 1e-2          # Adam learning rate; identical to the former spelling 10e-3 (0.01, not 0.001)

In [None]:
# Set the random seeds for deterministic results: Python, NumPy and PyTorch
# all receive the same seed.
SEED = 1234
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# On GPU, also seed CUDA and ask cuDNN for deterministic algorithms.
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

In [None]:
# Pick the GPU when one is available and load the SMILES dataset onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dir_dataset = '/content/LSTM/qm9_smiles.txt'
dataset = Dataset_(dir_dataset, device)

# 80 % of the samples go to training, the remainder to testing.
n_samples = len(dataset)
train_size = int(n_samples * 0.8)
test_size = n_samples - train_size
print('Train size ={}, Test size ={}'.format(train_size, test_size))

# The split uses its own seeded generator, so it does not depend on how much
# of the global RNG stream has already been consumed.
train_, test_ = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

# drop_last=True keeps every batch at exactly `batch_size` samples.
train_l = DataLoader(train_, batch_size=batch_size, drop_last=True)
test_l = DataLoader(test_, batch_size=batch_size, drop_last=True)

Train size =496389, Test size =124098


In [None]:
# Build the network, move it to the selected device and attach an Adam optimiser.
# NOTE(review): input_dim is the vocabulary size plus one - presumably an extra
# padding index; confirm against Dataset_.
model = RNN_forward(
    input_dim=len(dataset.dict_) + 1,
    emb_dim=EMBED_DIM,
    hid_dim=n_hidden,
    n_layers=n_layers,
    layer_norm=False,
    dropout=drop1,
)
model.to(device)
opt = torch.optim.Adam(model.parameters(), lr=LR)

# Initial hidden state for batches of `batch_size` sequences.
hidden = model.init_hidden_(batch_size, device)

In [None]:
def sample_(model,n_molecules,temperature,dataset,device,seq_len=100):
    '''Sample strings from the model, one token at a time.

    Every sequence starts from the 'G' token. At each step the logits
    returned by the model are divided by `temperature`, turned into
    probabilities with a softmax, and the next token is drawn from that
    distribution. Sampling always runs for `seq_len` steps; each decoded
    string is then cut at its first 'E' token.

    Note: this definition shadows the `sample_` imported from utils_.

    Input:
        1) model       - exposes eval(), init_hidden_(batch, device) and
                         model(x, hidden) -> (logits, hidden); it is put
                         into eval mode and left in eval mode
        2) n_molecules - number of strings to sample (drawn as one batch)
        3) temperature - sampling temperature
        4) dataset     - exposes dict_ (token -> index) and dict_inv (index -> token)
        5) device      - torch device on which the sampling tensors are created
        6) seq_len     - number of tokens sampled per string
        Return:
            list of n_molecules sampled strings
        https://pytorch-nlp-tutorial-ny2018.readthedocs.io/en/latest/day2/sampling.html'''
    model.eval()

    # Sampling needs no gradients; without no_grad every step would extend an
    # autograd graph through the recurrent state and waste memory.
    with torch.no_grad():
        hidden = model.init_hidden_(n_molecules, device)
        # Column vector holding the start token for every sequence.
        x = torch.full((n_molecules, 1), dataset.dict_['G'], dtype=torch.long, device=device)
        # Placeholder first column (dropped below); long dtype so that no
        # float/long promotion happens when token ids are appended.
        seq = torch.ones((n_molecules, 1), dtype=torch.long, device=device)

        for _ in range(seq_len):
            logits, hidden = model(x, hidden)
            prob = softmax(logits / temperature, dim=-1)
            x = torch.multinomial(prob, 1)
            seq = torch.hstack([seq, x])

    # Map token ids back to symbols, skipping the placeholder column.
    matrix = np.vectorize(dataset.dict_inv.get)(seq.cpu().numpy()[:, 1:])

    smiles_list = []
    for row in range(n_molecules):
        string_ = ''.join(list(matrix[row, :]))
        # Everything from the first 'E' token onwards is discarded.
        smiles_list.append(string_.split('E')[0])

    return smiles_list

In [None]:
# Reference molecules that generated samples are compared against in model_quality.
# NOTE(review): the dataset cell reads '/content/LSTM/qm9_smiles.txt' (no data/
# directory) - confirm that both paths exist in the repository.
# The context manager closes the file once the list is built.
with open('/content/LSTM/data/qm9_smiles.txt','r') as f:
    # Keep the first comma-separated field of each line minus its last character.
    # NOTE(review): presumably that character is the trailing newline of
    # comma-free lines - confirm against the file format.
    data_mols = [line.split(',')[0][:-1] for line in f]

In [None]:
# Baseline: sample 1000 strings from the untrained model at temperature 1 and
# score them against the reference molecules.
before_train_sample = sample_(
    model, n_molecules=1000, temperature=1, dataset=dataset, device=device
)
valid, unique, novel = model_quality(before_train_sample, data_mols)

In [None]:
# Report the baseline metrics; the wording mirrors the 'After train' report.
print('Before train Valid={}, Unique={}, Novel={}'.format(valid, unique, novel))

Before Trauin Valid=0.041, Unique=0.005, Novel=0.005


In [None]:
# Save 100 freshly drawn samples from the untrained model, one per line.
# This is a new draw, not a slice of before_train_sample.
with open('before_train.txt', 'w') as out_file:
    out_file.writelines(
        '%s\n' % smiles for smiles in sample_(model,100,1,dataset,device)
    )

In [None]:
# Train for 10 passes over the training loader.
for epoch in range(10):
    for input_seq, target_seq, mask in train_l:
        # Sampling cells switch the model to eval mode, so training mode is
        # re-enabled before every batch.
        model.train()
        opt.zero_grad()

        # Every batch starts from the same initial `hidden`; the hidden state
        # returned by the model is discarded.
        pred, _ = model(input_seq, hidden)
        batch_loss = loss_(pred, target_seq, mask)

        batch_loss.backward()
        opt.step()

    # Per-epoch report, currently disabled:
    # print('Epoch ={},Train loss = {}, Test loss ={}'.format(epoch,batch_loss,test_loss(test_l,model,batch_size,device)))

In [None]:
# Persist the whole trained model object (not just its state_dict); loading it
# back requires the model class to be importable.
torch.save(obj=model, f='/content/normed_model.pth')

In [None]:
# Sample 1000 strings from the trained model at temperature 1 and score them
# against the reference molecules.
after_train_sample = sample_(
    model, n_molecules=1000, temperature=1, dataset=dataset, device=device
)
valid, unique, novel = model_quality(after_train_sample, data_mols)

In [None]:
# Report the metrics computed for the trained model.
after_train_message = 'After train Valid={}, Unique={}, Novel={}'.format(valid, unique, novel)
print(after_train_message)

After train Valid=0.948, Unique=0.944, Novel=0.381


In [None]:
# Repeat of the post-training evaluation on a new draw of 1000 samples;
# the names after_train_sample, valid, unique and novel are overwritten.
after_train_sample = sample_(
    model, n_molecules=1000, temperature=1, dataset=dataset, device=device
)
valid, unique, novel = model_quality(after_train_sample, data_mols)

In [None]:
# Report the metrics of the repeated post-training evaluation.
after_train_message = 'After train Valid={}, Unique={}, Novel={}'.format(valid, unique, novel)
print(after_train_message)

After train Valid=0.624, Unique=0.621, Novel=0.471


In [None]:
import json

In [None]:
# Export the token -> index vocabulary next to the saved model.
with open('dict_normed.json', 'w') as vocab_file:
    json.dump(dataset.dict_, vocab_file)

In [None]:
# Export the index -> token vocabulary.
# JSON object keys are always strings, so integer indices come back as str
# when this file is loaded; convert them with int() before lookup.
with open('inv_dict_normed.json', 'w') as inv_vocab_file:
    json.dump(dataset.dict_inv, inv_vocab_file)

In [None]:
# Save 100 freshly drawn samples from the trained model, one per line.
# This is a new draw, not a slice of after_train_sample.
with open('after_train.txt', 'w') as out_file:
    out_file.writelines(
        '%s\n' % smiles for smiles in sample_(model,100,1,dataset,device)
    )