# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Arguments" data-toc-modified-id="Arguments-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Arguments</a></div><div class="lev1 toc-item"><a href="#MetaCyc-subset-of-QM9" data-toc-modified-id="MetaCyc-subset-of-QM9-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>MetaCyc subset of QM9</a></div><div class="lev1 toc-item"><a href="#Define-model-and-optimizer" data-toc-modified-id="Define-model-and-optimizer-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Define model and optimizer</a></div><div class="lev1 toc-item"><a href="#Statistics" data-toc-modified-id="Statistics-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Statistics</a></div><div class="lev1 toc-item"><a href="#Create-model" data-toc-modified-id="Create-model-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Create model</a></div><div class="lev1 toc-item"><a href="#get-the-best-checkpoint-if-available-without-training" data-toc-modified-id="get-the-best-checkpoint-if-available-without-training-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>get the best checkpoint if available without training</a></div>

In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.autograd import Variable

import time
import argparse
import os
import numpy as np
import sys
sys.path.append(os.getcwd())
# Our Modules
import utils
from utils import datasets
from models.MPNN import MPNN
from models.MPNNv2 import MPNNv2
from models.MPNNv3 import MPNNv3
from utils.LogMetric import AverageMeter, Logger


# Arguments

In [2]:
parser = argparse.ArgumentParser(description='Neural message passing')

parser.add_argument('--dataset', default='qm9', help='QM9')
parser.add_argument('--edge-rep', default='raw_distance', choices=['raw_distance','chem_graph','distance_bin'] )
parser.add_argument('--datasetPath', default='./mpnn-data/qm9/dsgdb9nsd/', help='dataset path')
parser.add_argument('--logPath', default='./log/qm9/mpnn/', help='log path')
parser.add_argument('--plotLr', default=False, help='allow plotting the data')
parser.add_argument('--plotPath', default='./plot/qm9/mpnn/', help='plot path')
parser.add_argument('--resume', default='./checkpoint/qm9/mpnn/',
                    help='path to latest checkpoint')
# Optimization Options
parser.add_argument('--batch-size', type=int, default=100, metavar='N',
                    help='Input batch size for training (default: 20)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='Enables CUDA training')
parser.add_argument('--epochs', type=int, default=50, metavar='N',
                    help='Number of epochs to train (default: 50)')
parser.add_argument('--lr', type=lambda x: restricted_float(x, [1e-5, 1e-2]), default=1e-3, metavar='LR',
                    help='Initial learning rate [1e-5, 5e-4] (default: 1e-4)')
parser.add_argument('--lr-decay', type=lambda x: restricted_float(x, [.01, 1]), default=0.6, metavar='LR-DECAY',
                    help='Learning rate decay factor [.01, 1] (default: 0.6)')
parser.add_argument('--schedule', type=list, default=[0.1, 0.9], metavar='S',
                    help='Percentage of epochs to start the learning rate decay [0, 1] (default: [0.1, 0.9])')
parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                    help='SGD momentum (default: 0.9)')
# i/o
parser.add_argument('--log-interval', type=int, default=20, metavar='N',
                    help='How many batches to wait before logging training status')
# Accelerating
parser.add_argument('--prefetch', type=int, default=2, help='Pre-fetching threads.')

# Model modification
parser.add_argument('--model', type=str,help='MPNN model name [MPNN, MPNNv2, MPNNv3]',
                        default='MPNN')


args = parser.parse_args(['--model','MPNNv2', '--edge-rep','chem_graph', '--no-cuda'])
print(args)


Namespace(batch_size=100, dataset='qm9', datasetPath='./mpnn-data/qm9/dsgdb9nsd/', edge_rep='chem_graph', epochs=50, logPath='./log/qm9/mpnn/', log_interval=20, lr=0.001, lr_decay=0.6, model='MPNNv2', momentum=0.9, no_cuda=True, plotLr=False, plotPath='./plot/qm9/mpnn/', prefetch=2, resume='./checkpoint/qm9/mpnn/', schedule=[0.1, 0.9])


In [3]:
def restricted_float(x, inter):
    x = float(x)
    if x < inter[0] or x > inter[1]:
        raise argparse.ArgumentTypeError("%r not in range [1e-5, 1e-4]"%(x,))
    return x


In [4]:

best_er1 = 0

    # Check if CUDA is enabled
args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Load data
root = args.datasetPath
root

'./mpnn-data/qm9/dsgdb9nsd/'

# MetaCyc subset of QM9

In [5]:
print('Prepare files')
import pandas as pd
qm9_metacyc = pd.read_table('Dataset/Qm9_metacyc.tab')
qm9_metacyc

Prepare files


Unnamed: 0,FrameId,Name,Kegg,InChI,Smiles,Gibbs-0,cpd,nC
0,BUTYRAMIDE,butyramide,NIL,"InChI=1S/C4H9NO/c1-2-3-4(5)6/h2-3H2,1H3,(H2,5,6)",CCCC(N)=O,51.370094,./dsgdb9nsd_000262.xyz,4
1,CPD-282,cyclohex-2-enone,C02395,"InChI=1S/C6H8O/c7-6-4-2-1-3-5-6/h2,4H,1,3,5H2",C1(CCC(C=C1)=O),76.978966,./dsgdb9nsd_002060.xyz,6
2,CPD-7031,3-methylbutanal,C07329,"InChI=1S/C5H10O/c1-5(2)3-4-6/h4-5H,3H2,1-2H3",CC(C)C[CH]=O,71.711205,./dsgdb9nsd_000255.xyz,5
3,CPD-9383,glycine methyl ester,NIL,"InChI=1S/C3H7NO2/c1-6-3(5)2-4/h2,4H2,1H3",C([NH2])C(=O)OC,-3.392161,./dsgdb9nsd_000365.xyz,3
4,CYTOSINE,cytosine,C00380,"InChI=1S/C4H5N3O/c5-3-1-2-6-4(8)7-3/h1-2H,(H3,...",C1(NC(=O)N=C(N)C=1),19.905600,./dsgdb9nsd_004243.xyz,4
5,CYTOSINE,cytosine,C00380,"InChI=1S/C4H5N3O/c5-3-1-2-6-4(8)7-3/h1-2H,(H3,...",C1(NC(=O)N=C(N)C=1),19.905600,./dsgdb9nsd_004318.xyz,4
6,CYTOSINE,cytosine,C00380,"InChI=1S/C4H5N3O/c5-3-1-2-6-4(8)7-3/h1-2H,(H3,...",C1(NC(=O)N=C(N)C=1),19.905600,./dsgdb9nsd_004371.xyz,4
7,CYTOSINE,cytosine,C00380,"InChI=1S/C4H5N3O/c5-3-1-2-6-4(8)7-3/h1-2H,(H3,...",C1(NC(=O)N=C(N)C=1),19.905600,./dsgdb9nsd_004403.xyz,4
8,URACIL,uracil,C00106,"InChI=1S/C4H4N2O2/c7-3-1-2-5-4(8)6-3/h1-2H,(H2...",C1(=CC(NC(=O)N1)=O),-28.995518,./dsgdb9nsd_004258.xyz,4
9,URACIL,uracil,C00106,"InChI=1S/C4H4N2O2/c7-3-1-2-5-4(8)6-3/h1-2H,(H2...",C1(=CC(NC(=O)N1)=O),-28.995518,./dsgdb9nsd_004294.xyz,4


In [6]:
files = [f for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
                                        and './' + os.path.basename(f) in 
                                             set(qm9_metacyc['cpd'].values)]

idx = np.random.permutation(len(files))
idx = idx.tolist()
files

['dsgdb9nsd_000001.xyz',
 'dsgdb9nsd_000002.xyz',
 'dsgdb9nsd_000003.xyz',
 'dsgdb9nsd_000004.xyz',
 'dsgdb9nsd_000005.xyz',
 'dsgdb9nsd_000006.xyz',
 'dsgdb9nsd_000007.xyz',
 'dsgdb9nsd_000008.xyz',
 'dsgdb9nsd_000009.xyz',
 'dsgdb9nsd_000011.xyz',
 'dsgdb9nsd_000012.xyz',
 'dsgdb9nsd_000013.xyz',
 'dsgdb9nsd_000014.xyz',
 'dsgdb9nsd_000017.xyz',
 'dsgdb9nsd_000018.xyz',
 'dsgdb9nsd_000019.xyz',
 'dsgdb9nsd_000020.xyz',
 'dsgdb9nsd_000021.xyz',
 'dsgdb9nsd_000022.xyz',
 'dsgdb9nsd_000026.xyz',
 'dsgdb9nsd_000028.xyz',
 'dsgdb9nsd_000031.xyz',
 'dsgdb9nsd_000035.xyz',
 'dsgdb9nsd_000037.xyz',
 'dsgdb9nsd_000038.xyz',
 'dsgdb9nsd_000039.xyz',
 'dsgdb9nsd_000040.xyz',
 'dsgdb9nsd_000042.xyz',
 'dsgdb9nsd_000044.xyz',
 'dsgdb9nsd_000048.xyz',
 'dsgdb9nsd_000049.xyz',
 'dsgdb9nsd_000050.xyz',
 'dsgdb9nsd_000051.xyz',
 'dsgdb9nsd_000052.xyz',
 'dsgdb9nsd_000055.xyz',
 'dsgdb9nsd_000060.xyz',
 'dsgdb9nsd_000064.xyz',
 'dsgdb9nsd_000068.xyz',
 'dsgdb9nsd_000069.xyz',
 'dsgdb9nsd_000070.xyz',


In [None]:
valid_ids = [files[i] for i in idx[0:100]]
test_ids = [files[i] for i in idx[100:200]]
train_ids = [files[i] for i in idx[200:]]

data_train = utils.Qm9(root, train_ids, edge_transform=datasets.qm9_edges, e_representation=args.edge_rep)
data_valid = utils.Qm9(root, valid_ids, edge_transform=datasets.qm9_edges, e_representation=args.edge_rep)
data_test = utils.Qm9(root, test_ids, edge_transform=datasets.qm9_edges, e_representation=args.edge_rep)

# Define model and optimizer


In [7]:
print('Define model')
# Select one graph
g_tuple, l = data_train[0]
g, h_t, e = g_tuple

vals = utils.datasets.get_values(data_valid, 0, len(data_valid), ['target_mean','target_std'])
param = np.array([file_res['params'] for file_res in vals])
param

Define model


array([[  3.84000000e+00,   5.02400000e+01,  -2.42900000e-01, ...,
         -3.07571489e+02,  -3.07612431e+02,   2.52790000e+01],
       [  3.61930000e+00,   3.91300000e+01,  -2.25300000e-01, ...,
         -2.26156144e+02,  -2.26187104e+02,   1.33710000e+01],
       [  1.77170000e+00,   7.52600000e+01,  -2.52700000e-01, ...,
         -4.16893279e+02,  -4.16933921e+02,   2.81210000e+01],
       ..., 
       [  1.76750000e+00,   2.40400000e+01,  -2.68200000e-01, ...,
         -1.53738466e+02,  -1.53766642e+02,   9.17600000e+00],
       [  1.32230000e+00,   5.15000000e+01,  -2.52500000e-01, ...,
         -3.81625036e+02,  -3.81667220e+02,   2.62090000e+01],
       [  3.24170000e+00,   7.57400000e+01,  -2.22100000e-01, ...,
         -4.34146769e+02,  -4.34187834e+02,   3.08010000e+01]])

# Statistics

In [8]:
print('\tStatistics')
stat_dict = datasets.get_graph_stats(data_valid, ['target_mean', 'target_std'], multiprocess_p=False)
stat_dict

	Statistics


{'target_mean': array([  2.69689900e+00,   6.26179000e+01,  -2.40788000e-01,
         -1.22300000e-03,   2.39561000e-01,   9.14352607e+02,
          1.17970900e-01,  -3.52318540e+02,  -3.52311440e+02,
         -3.52310496e+02,  -3.52349674e+02,   2.59112000e+01]),
 'target_std': array([  1.90566044e+00,   1.67299833e+01,   2.74417648e-02,
          4.65505786e-02,   5.93242175e-02,   4.01016306e+02,
          3.57222303e-02,   9.62331120e+01,   9.62319063e+01,
          9.62319063e+01,   9.62355670e+01,   6.76316899e+00])}

In [9]:
data_train.set_target_transform(lambda x: datasets.normalize_data(x,stat_dict['target_mean'],
                                                                            stat_dict['target_std']))
data_valid.set_target_transform(lambda x: datasets.normalize_data(x, stat_dict['target_mean'],
                                                                            stat_dict['target_std']))
data_test.set_target_transform(lambda x: datasets.normalize_data(x, stat_dict['target_mean'],
                                                                           stat_dict['target_std']))

In [11]:
    train_loader = torch.utils.data.DataLoader(data_train,
                                               batch_size=args.batch_size, shuffle=True,
                                               collate_fn=datasets.collate_g,
                                               num_workers=args.prefetch, pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(data_valid,
                                               batch_size=args.batch_size, collate_fn=datasets.collate_g,
                                               num_workers=args.prefetch, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(data_test,
                                              batch_size=args.batch_size, collate_fn=datasets.collate_g,
                                              num_workers=args.prefetch, pin_memory=True)


# Create model

In [13]:
    in_n = [len(h_t[0]), len(list(e.values())[0])]
    hidden_state_size = 73
    message_size = 73
    n_layers = 3
    l_target = len(l)
    type ='regression'
    if args.model == 'MPNNv2':
        model = MPNNv2(in_n, [5, 15, 15], [10, 20, 20], l_target, type=type)
    elif args.model == 'MPNNv3':
        model = MPNNv3([1, 2, 3, 4], in_n, [5, 15, 15], 30, l_target, type=type)
    else:
        model = MPNN(in_n, hidden_state_size, message_size, n_layers, l_target, type=type)
    del in_n, hidden_state_size, message_size, n_layers, l_target, type

    print('Optimizer')
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    criterion = nn.MSELoss()

    evaluation = lambda output, target: torch.mean(torch.abs(output - target) / torch.abs(target))

    print('Logger')
    logger = Logger(args.logPath)

    lr_step = (args.lr-args.lr*args.lr_decay)/(args.epochs*args.schedule[1] - args.epochs*args.schedule[0])


Optimizer
Logger


# get the best checkpoint if available without training

In [14]:
if args.resume:
        checkpoint_dir = args.resume
        best_model_file = os.path.join(checkpoint_dir, 'model_best.pth')
        if not os.path.isdir(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        if os.path.isfile(best_model_file):
            print("=> loading best model '{}'".format(best_model_file))
            checkpoint = torch.load(best_model_file)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_er1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded best model '{}' (epoch {})".format(best_model_file, checkpoint['epoch']))
        else:
            print("=> no best model found at '{}'".format(best_model_file))


=> loading best model './checkpoint/qm9/mpnn/model_best.pth'


AssertionError: Torch not compiled with CUDA enabled