# Training program for Experiment

## Setting the module search path

In [2]:
import sys
import os

# Put the parent of the current working directory on the import path
# (presumably so the project modules imported by later cells resolve).
sys.path.append(os.path.dirname(os.getcwd()))


## Vocab, dataset のロード

In [3]:
from fast_jtnn import *
from MS_PredictModel import MS_Dataset,MS_Dataset_pickle,dataset_load
import pickle
import torch

VOCAB_FILE = "./MS_vocab.txt"

# One vocabulary entry per line.  The context manager closes the file handle
# as soon as the list is built instead of leaving it to the garbage collector.
with open(VOCAB_FILE, "r") as vocab_file:
    vocab = [x.strip("\r\n ") for x in vocab_file]
vocab = Vocab(vocab)

# Disabled alternative: build the dataset from a database query instead of
# the pickled snapshot loaded below.
# MS_Dataset.QUERY = """select smiles,file_path from massbank where ms_type="MS" and instrument_type="EI-B" and smiles<>'N/A';"""
# dataset = MS_Dataset(vocab=vocab,host="localhost",database="chemoinfo",batch_size=20)

# Passed to dataset_load(); presumably the fraction of samples that goes to
# the training split (0.9 matches the 6584 / 732 split in the recorded
# output) -- confirm against dataset_load.
train_vali_rate = 0.9

# NOTE(review): the .pkl extension suggests dataset_load() unpickles this
# file; only load pickles that come from a trusted source.
# The third argument (20) is presumably the batch size -- confirm.
train_dataset, vali_dataset = dataset_load("./massbank.pkl",vocab,20,train_vali_rate)
print("number of train dataset :",len(train_dataset))
print("number of validation dataset :",len(vali_dataset))

('number of train dataset :', 6584)
('number of validation dataset :', 732)


## モデルの作成

In [4]:
from ms_encoder import ms_peak_encoder,ms_peak_encoder_lstm,ms_peak_encoder_cnn
import torch.nn as nn
import torch
# Sizes handed to the JTNNVAE constructor; latent_size is also used as the
# encoder's output size so that both models share one latent space.
hidden_size = 100
latent_size = 56
depthT = 20
depthG = 3

dec_model = JTNNVAE(vocab, hidden_size, latent_size, depthT, depthG).to('cuda')
print(dec_model)

# Disabled alternative: LSTM-based spectrum encoder.
#enc_model = ms_peak_encoder_lstm(train_dataset.max_spectrum_size,output_size=latent_size,\
#        hidden_size=100,embedding_size=5,num_rnn_layers=2,bidirectional=True,dropout_rate=0.5).to('cuda')
enc_model = ms_peak_encoder_cnn(
    train_dataset.max_spectrum_size,
    output_size=latent_size,
    hidden_size=100,
    embedding_size=10,
    num_rnn_layers=2,
    bidirectional=True,
    dropout_rate=0.5,
).to('cuda')
print(enc_model)

# 1-D parameters are zeroed, everything else gets Xavier-normal values.
# NOTE(review): load_state_dict() right below overwrites the parameters found
# in the checkpoint, so this initialisation may be redundant -- confirm.
for weight in dec_model.parameters():
    if weight.dim() != 1:
        nn.init.xavier_normal_(weight)
        continue
    nn.init.constant_(weight, 0)

load_model = "./vae_model/model.iter-50000"
dec_model.load_state_dict(torch.load(load_model, map_location='cuda'))

# Parameter counts in thousands: first the decoder, then the encoder
# (both lines carry the same label).
print("Model #Params: %dK" % (sum(p.nelement() for p in dec_model.parameters()) / 1000,))
print("Model #Params: %dK" % (sum(p.nelement() for p in enc_model.parameters()) / 1000,))



JTNNVAE(
  (jtnn): JTNNEncoder(
    (embedding): Embedding(1027, 100)
    (outputNN): Sequential(
      (0): Linear(in_features=200, out_features=100, bias=True)
      (1): ReLU()
    )
    (GRU): GraphGRU(
      (W_z): Linear(in_features=200, out_features=100, bias=True)
      (W_r): Linear(in_features=100, out_features=100, bias=False)
      (U_r): Linear(in_features=100, out_features=100, bias=True)
      (W_h): Linear(in_features=200, out_features=100, bias=True)
    )
  )
  (decoder): JTNNDecoder(
    (embedding): Embedding(1027, 100)
    (W_z): Linear(in_features=200, out_features=100, bias=True)
    (U_r): Linear(in_features=100, out_features=100, bias=False)
    (W_r): Linear(in_features=100, out_features=100, bias=True)
    (W_h): Linear(in_features=200, out_features=100, bias=True)
    (W): Linear(in_features=128, out_features=100, bias=True)
    (U): Linear(in_features=128, out_features=100, bias=True)
    (U_i): Linear(in_features=200, out_features=100, bias=True)
    (W_o)

## Setting up the optimizer

In [5]:
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

# Adam is given the encoder's parameters only; the decoder's parameters are
# not registered with this optimizer.
optimizer = optim.Adam(enc_model.parameters(), lr=0.001)
#optimizer = optim.SGD(enc_model.parameters(),lr=100)

# Exponential decay with factor 0.9 per scheduler.step() call.  The call
# below is commented out, so creating the scheduler alone does not change
# the learning rate.
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
#scheduler.step()

In [None]:
from MS_PredictModel import ms_peak_encoder,MS_Dataset
from torch.autograd import Variable
from tqdm import tqdm
import numpy as np

# Placeholder for an optional tqdm progress bar; every use of it in this
# cell is commented out.
pbar = None

# Batch sizes used while iterating the two datasets.
train_dataset.batch_size, vali_dataset.batch_size = 20, 10

# Only referenced by the commented-out learning-rate decay in training().
anneal_iter = 7400

# KL-weight schedule read by training(): beta starts at 0 and, once
# total_step has reached `warmup`, grows by `step_beta` on every multiple of
# `kl_anneal_iter` steps, capped at `max_beta`.
beta = 0
max_beta = 1.0
step_beta = 0.02
kl_anneal_iter = 10000
warmup = 20000

def _split_latent(h):
    """Split an encoder output along dim 1 into its first and second half.

    Floor division keeps the slice index an integer on Python 2 and Python 3
    alike (plain ``/`` yields a float under true division, which cannot be
    used as a slice index).
    """
    half = h.shape[1] // 2
    return h[:, :half], h[:, half:]


def _validation_accuracy():
    """Return the mean [word, topo, assm] accuracy in percent over vali_dataset.

    Runs the encoder with ``training=False, sample=False`` under
    ``torch.no_grad()``.  Returns zeros when the dataset yields no batch, so
    the caller never divides by zero.
    """
    totals = np.zeros(3)
    num_batches = 0
    for batch in vali_dataset:
        x_batch, x_jtenc_holder, x_mpn_holder, x_jtmpn_holder, x, y = batch
        x = x.to('cuda')
        y = y.to('cuda')
        with torch.no_grad():
            h, _ = enc_model(x, y, training=False, sample=False)
            tree_vec, mol_vec = _split_latent(h)
            _, x_tree_mess = dec_model.jtnn(*x_jtenc_holder)
            _, _, word_acc, topo_acc = dec_model.decoder(x_batch, tree_vec)
            _, assm_acc = dec_model.assm(x_batch, x_jtmpn_holder, mol_vec, x_tree_mess)
            totals = totals + np.array([word_acc * 100, topo_acc * 100, assm_acc * 100])
            num_batches += 1
        del x, y, h
    return totals / max(num_batches, 1)


def training(max_epoch = 100, log_interval = 200):
    """Train ``enc_model`` so that its output can be decoded by ``dec_model``.

    Every training batch is encoded, the encoder output is split in two
    halves (first half -> ``dec_model.decoder``, second half ->
    ``dec_model.assm``) and the summed word/topology/assembly losses plus
    ``beta * kl_loss`` are back-propagated before the module-level
    ``optimizer`` is stepped.

    Parameters
    ----------
    max_epoch : int
        Number of passes over ``train_dataset``.
    log_interval : int
        Every ``log_interval`` steps the running training metrics are
        averaged, a validation pass is run, one line is printed, one row is
        appended to ``log2.csv`` and the encoder weights are saved to
        ``./enc_model/model.iter-<step>``.

    Side effects
    ------------
    Overwrites ``log2.csv`` at start, writes checkpoints (the ``./enc_model``
    directory must already exist) and updates the module-level ``beta``
    according to ``kl_anneal_iter``, ``warmup``, ``step_beta`` and
    ``max_beta``.
    """
    global beta
    total_step = 0
    # Running sums of [kl_loss, word_acc %, topo_acc %, assm_acc %].
    meters = np.zeros(4)
    with open("log2.csv","w") as f:
        f.write("epoch,iter.,word,topo,assm,vali word,vali topo,vali assm\n")
    for epoch in range(max_epoch):
        print("epoch : ",epoch)
        for batch in train_dataset:
            x_batch, x_jtenc_holder, x_mpn_holder, x_jtmpn_holder, x, y = batch
            total_step += 1
            x = x.to('cuda')
            y = y.to('cuda')

            enc_model.zero_grad()
            dec_model.zero_grad()
            optimizer.zero_grad()

            h, kl_loss = enc_model(x, y, training=True, sample=True)
            tree_vec, mol_vec = _split_latent(h)
            _, x_tree_mess = dec_model.jtnn(*x_jtenc_holder)
            word_loss, topo_loss, word_acc, topo_acc = dec_model.decoder(x_batch, tree_vec)
            assm_loss, assm_acc = dec_model.assm(x_batch, x_jtmpn_holder, mol_vec, x_tree_mess)
            total_loss = word_loss + topo_loss + assm_loss + beta * kl_loss
            total_loss.backward()
            optimizer.step()
            del x, y, h

            meters = meters + np.array([kl_loss.item(), word_acc * 100, topo_acc * 100, assm_acc * 100])
            if total_step % log_interval == 0:
                vali_meters = _validation_accuracy()
                meters /= log_interval
                print("[%d] , kl_loss %.2f, Word: %.2f, Topo: %.2f, Assm: %.2f vali_Word: %.2f, vali_Topo: %.2f, vali_assm: %.2f, learning rate: %.4f" % \
                    (total_step, meters[0], meters[1], meters[2], meters[3], vali_meters[0], vali_meters[1], vali_meters[2], scheduler.get_lr()[0]))
                with open("log2.csv","a") as f:
                    # meters[0] is the KL loss, which has no CSV column; the
                    # word/topo/assm columns are meters[1], meters[2], meters[3].
                    f.write("%d,%d,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f\n" % (epoch, total_step, meters[1], meters[2], meters[3], vali_meters[0], vali_meters[1], vali_meters[2]))
                sys.stdout.flush()
                # Start from fresh zeros: multiplying by 0 would keep a NaN/inf
                # in the running sums forever.
                meters = np.zeros(4)
                torch.save(enc_model.state_dict(), "./enc_model" + "/model.iter-" + str(total_step))
            #if total_step % anneal_iter == 0:
                #scheduler.step()

            if total_step % kl_anneal_iter == 0 and total_step >= warmup:
                beta = min(max_beta, beta + step_beta)

#import pdb; pdb.set_trace()
import traceback

# Create the checkpoint directory if needed and train for 500 epochs.  Only
# RuntimeError is handled here: its traceback and message are printed and the
# cell finishes normally; any other exception propagates.
try:
    checkpoint_dir = "./enc_model"
    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    training(500)
except RuntimeError as err:
    error_report = traceback.format_exc()
    print(error_report)
    print(err)




('epoch : ', 0)




[200] , kl_loss 311.75, Word: 35.80, Topo: 85.28, Assm: 82.59 vali_Word: 33.66, vali_Topo: 85.42, vali_assm: 85.39, learning rate: 0.0010
('epoch : ', 1)
[400] , kl_loss 565.33, Word: 35.03, Topo: 85.93, Assm: 85.07 vali_Word: 34.72, vali_Topo: 85.19, vali_assm: 84.71, learning rate: 0.0010
[600] , kl_loss 717.27, Word: 36.48, Topo: 86.26, Assm: 85.52 vali_Word: 38.79, vali_Topo: 85.85, vali_assm: 84.47, learning rate: 0.0010
('epoch : ', 2)
[800] , kl_loss 767.41, Word: 37.97, Topo: 86.42, Assm: 85.20 vali_Word: 41.30, vali_Topo: 85.88, vali_assm: 84.29, learning rate: 0.0010
('epoch : ', 3)
[1000] , kl_loss 773.03, Word: 40.91, Topo: 86.25, Assm: 85.19 vali_Word: 43.84, vali_Topo: 85.60, vali_assm: 85.08, learning rate: 0.0010
[1200] , kl_loss 748.75, Word: 43.99, Topo: 86.57, Assm: 85.02 vali_Word: 44.63, vali_Topo: 85.91, vali_assm: 82.81, learning rate: 0.0010
('epoch : ', 4)
[1400] , kl_loss 767.15, Word: 45.69, Topo: 86.57, Assm: 85.50 vali_Word: 46.01, vali_Topo: 86.43, vali_as

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

# Set model_path to a saved encoder state dict to evaluate that checkpoint;
# with None the enc_model currently held in memory is evaluated as it is.
model_path = None
if model_path is not None:
    encoder_state = torch.load(model_path, map_location='cuda')
    enc_model.load_state_dict(encoder_state)

# Batch sizes used while iterating the two datasets.
train_dataset.batch_size, vali_dataset.batch_size = 20, 10

def evaluation():
    """Decode every validation sample and pair it with its reference SMILES.

    Each batch of ``vali_dataset`` is encoded with ``enc_model``
    (``training=False, sample=False``), the encoder output is split into two
    halves along dim 1 and each row is decoded by ``dec_model.decode``.

    Returns
    -------
    list of (str, str)
        ``(true_smiles, predict_smiles)`` tuples, both passed through an
        RDKit MolFromSmiles -> MolToSmiles round trip.
    """
    # Floor division keeps these integers on Python 2 and Python 3 alike;
    # plain "/" gives a float under true division, which is rejected both as
    # a slice index and as a view() size.
    half_latent = latent_size // 2
    ret = []
    with torch.no_grad():
        for batch in vali_dataset:
            x_batch, x_jtenc_holder, x_mpn_holder, x_jtmpn_holder, x, y = batch
            x = x.to('cuda')
            y = y.to('cuda')

            h, _ = enc_model(x, y, training=False, sample=False)
            split_at = h.shape[1] // 2
            tree_vec = h[:, :split_at]
            mol_vec = h[:, split_at:]
            for num in range(h.size()[0]):

                true_smiles = x_batch[num].smiles
                # Third argument is passed positionally as False (presumably
                # a "probabilistic decode" switch -- confirm in JTNNVAE.decode).
                predict_smiles = dec_model.decode(tree_vec[num].view(1, half_latent), mol_vec[num].view(1, half_latent), False)

                # Normalise both SMILES through an RDKit round trip.
                # NOTE(review): if decode() can return None or an unparsable
                # string, these calls raise -- confirm and handle if needed.
                true_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(true_smiles), True)
                predict_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(predict_smiles), True)

                ret.append((true_smiles, predict_smiles))
    return ret
# Run the evaluation and report how many (true, predicted) SMILES pairs
# were collected; `result` is consumed by analyze_result() further down.
result = evaluation()
print(len(result))

In [None]:
import os
from rdkit import Chem
from rdkit.Chem import Draw
from PIL import ImageDraw,ImageFont
from rdkit.Chem import AllChem
from rdkit import DataStructs

def _re_smiles(smiles1,smiles2):
    """Return True when both SMILES strings normalise to the same SMILES.

    Each input is parsed with ``Chem.MolFromSmiles`` and written back with
    ``Chem.MolToSmiles(mol, True)``; the two resulting strings are compared.
    When either input cannot be parsed (``MolFromSmiles`` returns ``None``)
    the pair is reported as not matching instead of raising inside
    ``MolToSmiles``.
    """
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    if mol1 is None or mol2 is None:
        # An unparsable string cannot denote the same molecule as anything.
        return False
    return Chem.MolToSmiles(mol1, True) == Chem.MolToSmiles(mol2, True)

def is_structural_isomer(smiles1,smiles2):
    """Return True when both SMILES have the same per-element atom counts.

    Hydrogens are added explicitly (``Chem.AddHs``) before counting, so they
    take part in the comparison.  Only the element counts are compared: two
    identical molecules therefore also return True, and callers that want
    "same formula, different structure" must exclude exact matches
    themselves.
    """
    def element_counts(smiles):
        # Map element symbol -> number of atoms, hydrogens included.
        counts = {}
        mol_with_hs = Chem.AddHs(Chem.MolFromSmiles(smiles))
        for atom in mol_with_hs.GetAtoms():
            symbol = atom.GetSymbol()
            counts[symbol] = counts.get(symbol, 0) + 1
        return counts

    return element_counts(smiles1) == element_counts(smiles2)
    
def analyze_result(smiles_list,log_path="evaluation.csv",path="image_list"):
    """Score (true, predicted) SMILES pairs and render each pair as an image.

    Parameters
    ----------
    smiles_list : sequence of (str, str)
        ``(true_smiles, predicted_smiles)`` pairs.
    log_path : str
        File to (over)write.  Its first three lines are summary lines
        (number of pairs, number of exact matches, number of structural
        isomers); a header and one ``true,predict,ECFP,MACCS`` row per pair
        follow.
    path : str
        Directory that receives ``<index>.png`` for every pair; created when
        it does not exist.

    Notes
    -----
    Both SMILES of every pair must be parsable by RDKit; a pair that is not
    makes the fingerprint calls fail.
    """
    if not os.path.exists(path):
        os.mkdir(path)

    # Loop-invariant: the same default font is used for every image.
    # NOTE(review): assigning .size on the default font may have no visible
    # effect on the rendered text -- confirm with the installed Pillow.
    font = ImageFont.load_default()
    font.size=40

    # The log is opened once and closed (flushed) by the context manager even
    # when a later pair raises.
    with open(log_path,"w") as f:
        print("Number of data: %d" % len(smiles_list))
        f.write("Number of data,%d\n" % (len(smiles_list)))

        true_list = [[i, pair[0]] for i, pair in enumerate(smiles_list) if _re_smiles(pair[0], pair[1])]
        print("Number of matching: %d" % len(true_list))
        f.write("Number of matching: %d\n" % (len(true_list)))
        print(true_list)

        # Pairs with equal element counts that are NOT exact matches.  A set
        # of indices replaces the former list-membership test, which made
        # this step quadratic in the number of pairs.
        matched_indices = set(entry[0] for entry in true_list)
        isomer_list = [[i, pair[0]] for i, pair in enumerate(smiles_list)
                       if i not in matched_indices and is_structural_isomer(pair[0], pair[1])]
        # This count used to be reported under the label "Number of matching"
        # as well, which made the two summary lines indistinguishable.
        print("Number of structural isomers: %d" % len(isomer_list))
        f.write("Number of structural isomers: %d\n" % (len(isomer_list)))
        print(isomer_list)

        f.write("true,predict,ECFP-Tanimoto score,MACCS-Tanimoto score\n")

        for i, smiles in enumerate(smiles_list):
            true_mol = Chem.MolFromSmiles(smiles[0])
            predict_mol = Chem.MolFromSmiles(smiles[1])

            # Tanimoto similarity on Morgan fingerprints of radius 2.
            true_fingerprint = AllChem.GetMorganFingerprint(true_mol,2)
            predict_fingerprint = AllChem.GetMorganFingerprint(predict_mol,2)
            ECFP_score = DataStructs.TanimotoSimilarity(true_fingerprint,predict_fingerprint)

            # Tanimoto similarity on MACCS keys.
            true_fingerprint = AllChem.GetMACCSKeysFingerprint(true_mol)
            predict_fingerprint = AllChem.GetMACCSKeysFingerprint(predict_mol)
            MACCS_score = DataStructs.TanimotoSimilarity(true_fingerprint,predict_fingerprint)

            f.write(smiles[0]+","+smiles[1]+","+str(ECFP_score)+","+str(MACCS_score)+"\n")

            # True and predicted molecule in one image, both scores drawn in
            # black at the top-left corner.
            image = Draw.MolsToImage([true_mol,predict_mol])
            draw = ImageDraw.Draw(image)
            draw.text((0,0),str(ECFP_score)+","+str(MACCS_score),(0, 0, 0),font=font)
            image.save(os.path.join(path,"%d.png" % i))
    
# Score and render the pairs produced by evaluation(); with the default
# arguments this writes evaluation.csv and the image_list/ directory.
analyze_result(result)

In [None]:
# Copy the score table next to the rendered images, then zip the whole
# directory into image.zip (shell commands run through IPython's "!").
!cp -f evaluation.csv ./image_list
!zip -r image.zip ./image_list