In [None]:
from ccgnet import experiment as exp
from ccgnet.finetune import *
from ccgnet import layers
from ccgnet.layers import *
import tensorflow as tf
import numpy as np
import time
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import KFold
from ccgnet.Dataset import Dataset, DataLoader
from ccgnet.finetune import Finetuning
import glob
import random

In [None]:
# Both tables are converted to graphs with exactly the same options, so the
# keyword set is written once and unpacked into each call.
graph_options = dict(
    Desc=1,
    A_type='OnlyCovalentBond',
    hbond=0,
    pipi_stack=0,
    contact=0,
    make_dataframe=True,
    max_graph_size=160,
)

# data1 is built from ECC_Table.tab.
data1 = Dataset('data/CC_Table/ECC_Table.tab', mol_blocks_dir='data/Mol_Blocks.dir')
data1.make_graph_dataset(**graph_options)

# data2 is built from ECC_Table-DataAug.tab.
data2 = Dataset('data/CC_Table/ECC_Table-DataAug.tab', mol_blocks_dir='data/Mol_Blocks.dir')
data2.make_graph_dataset(**graph_options)

In [None]:
def _read_sample_list(path):
    """Evaluate the Python expression stored in ``path`` and return its value.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read.

    SECURITY: ``eval`` executes whatever code the file contains. Only use this
    on files you trust. If the files hold plain literals (presumably lists of
    sample tags -- confirm), ``ast.literal_eval`` would be a safer replacement.
    """
    with open(path) as handle:
        return eval(handle.read())


cl20_test = _read_sample_list('data/Test/Test_Samples/CL-20_Test.list')
tnt_test = _read_sample_list('data/Test/Test_Samples/TNT_Test.list')
# The negative samples in this ECC set were randomly selected from data/MEPS.csv, which was used in our work.
cv_set = _read_sample_list('data/ECC_Finetuning_Set.list')

In [None]:
class CCGNet(object):
    """Network definition assembled from ccgnet building blocks.

    ``build_model`` chains four ``CCGBlock`` calls, one ``ReadoutFunction``
    call, three fully connected stages (embedding -> batch norm -> ReLU ->
    dropout) and a final 2-unit embedding layer named ``'final'``.
    """

    def build_model(self, inputs, is_training, global_step):
        """Build the forward pass and return ``(out, labels)``.

        Args:
            inputs: sequence read by position -- 0: V, 1: A, 2: labels,
                3: mask, 4: graph_size, 5: tags (not consumed here),
                6: global_state, 7: subgraph_size.
            is_training: forwarded to every block, batch-norm and dropout call.
            global_step: forwarded as ``num_updates`` to the blocks and to the
                batch-norm layers.

        Returns:
            A tuple of the output of the final 2-unit layer and ``inputs[2]``
            (the labels) passed through unchanged.
        """
        V, A, labels, mask = inputs[0], inputs[1], inputs[2], inputs[3]
        graph_size = inputs[4]
        # inputs[5] (tags) is not consumed by the network.
        global_state, subgraph_size = inputs[6], inputs[7]

        # Message passing: four blocks with alternating filter counts.
        for width in (64, 16, 64, 16):
            V, global_state = CCGBlock(V, A, global_state, subgraph_size,
                                       no_filters=width, mask=mask,
                                       num_updates=global_step,
                                       is_training=is_training)

        # Readout.
        V = ReadoutFunction(V, global_state, graph_size, num_head=2,
                            is_training=is_training)

        # Prediction head: the three stages differ only in scope index and width.
        drop_rate = 0.457
        for stage, units in enumerate((256, 1024, 256), start=1):
            with tf.compat.v1.variable_scope('Predictive_FC_{}'.format(stage)):
                V = layers.make_embedding_layer(V, units)
                V = layers.make_bn(V, is_training, mask=None, num_updates=global_step)
                V = tf.nn.relu(V)
                V = tf.compat.v1.layers.dropout(V, drop_rate, training=is_training)

        out = layers.make_embedding_layer(V, 2, name='final')
        return out, labels

In [None]:
# Fine-tune every pretrained CCGNet snapshot on the ECC set with k-fold
# cross-validation. Each fold trains on the concatenation of the data1 and
# data2 training splits and is validated/tested on the data1 splits.
N_SPLITS = 5
start = time.time()
restore_path = './snapshot/CCGNet/CC_Dataset/*/'
# Loop-invariant settings are defined before the loop so that the analysis
# cells below can still read `snapshot_path` and `model_name` when the glob
# matches no checkpoint directory.
snapshot_path = 'finetuning_snapshot'
model_name = 'CCGNet'
for p in glob.glob(restore_path):
    restore_file = tf.train.latest_checkpoint(p)
    # Shuffle a copy: shuffling `cv_set` in place would leave the module-level
    # list in a different order after every run of this cell.
    shuffled_samples = list(cv_set)
    random.shuffle(shuffled_samples)
    cv_samples = np.array(shuffled_samples)
    kf = KFold(n_splits=N_SPLITS, shuffle=True)
    fold_5 = {}
    for n, (train_ix, test_ix) in enumerate(kf.split(cv_samples)):
        fold_5['fold-{}'.format(n)] = {'train': cv_samples[train_ix],
                                       'valid': cv_samples[test_ix]}
    # The pattern ends with '/', so the directory name is the second-to-last
    # component; its last character tags the pretrained run.
    # NOTE(review): only one character is used, so directory names whose
    # distinguishing suffix has two or more characters would collide -- confirm.
    dataset_name = 'ECC-'+p.split('/')[-2][-1]
    for fold_ix in range(N_SPLITS):
        fold = 'fold-{}'.format(fold_ix)
        print('\n################ {} ################'.format(fold))
        train_data1, valid_data1, test_data1 = data1.split(train_samples=fold_5[fold]['train'],
                                                           valid_samples=fold_5[fold]['valid'], with_test=True,
                                                           test_samples=list(set(cl20_test+tnt_test)))
        # Only the training part of the data2 split is used below.
        train_data2, valid_data2, test_data2 = data2.split(train_samples=fold_5[fold]['train'],
                                                           valid_samples=fold_5[fold]['valid'], with_test=True,
                                                           test_samples=list(set(cl20_test+tnt_test)))
        # Concatenate the two training splits component by component.
        train_data = [np.concatenate([part1, part2])
                      for part1, part2 in zip(train_data1, train_data2)]
        # Use the compat.v1 entry point, like the model code does:
        # `tf.reset_default_graph` does not exist at top level in TF 2.x.
        tf.compat.v1.reset_default_graph()
        model = CCGNet()
        # NOTE(review): `remove_keywords` presumably lists the variable scopes that are
        # not restored from the checkpoint -- confirm against ccgnet.finetune.Finetuning.
        model = Finetuning(model, train_data, valid_data1, with_test=True, test_data=test_data1, snapshot_path=snapshot_path, use_subgraph=True,
                           restore_file=restore_file, model_name=model_name, dataset_name=dataset_name+'/time_{}'.format(fold_ix),
                           remove_keywords=['Predictive_FC_3', 'final'])
        history = model.fit(save_info=True, save_att=True, silence=0,
                            metric='loss', early_stop=0, early_stop_cutoff=20)
    # Elapsed time is measured from `start`, i.e. it is cumulative over all
    # checkpoint directories processed so far.
    end = time.time()
    time_gap = end-start
    h, h_m = divmod(time_gap, 3600)
    m, s = divmod(h_m, 60)
    print('{}h {}m {}s'.format(int(h),int(m),round(s,2)))

In [None]:
from ccgnet.parselog import ParseTestLog, ParseTestLogEnsemble, get_info
import glob


# For every directory directly below <snapshot_path>/<model_name>, gather the
# files matching '*val*' one level further down and build an ensemble from them.
PATH = glob.glob('{}/{}/*'.format(snapshot_path, model_name))
ENS = []  # not filled anywhere in this cell
for group_dir in PATH:
    group_name = group_dir.split('/')[-1]
    print('#### {} ####'.format(group_name))
    val_list_ = glob.glob(group_dir + '/*/*val*')
    parsed_logs = [ParseTestLog(log_path) for log_path in val_list_]
    ens_ = ParseTestLogEnsemble(parsed_logs)
    # Bare attribute read; presumably `Reports` prints its summary -- confirm.
    ens_.Reports

In [None]:
from ccgnet.parselog import ParseValidLog

# Parse every path matched two levels below <snapshot_path>/<model_name>.
val_list = glob.glob('{}/{}/*/*'.format(snapshot_path, model_name))
l = [ParseValidLog(log_path) for log_path in val_list]

In [None]:
def get_test_log(p):
    """Return the path of ``model-val-info.txt`` located next to ``p``.

    The last '/'-separated component of ``p`` is replaced by
    ``model-val-info.txt``. A single separator is kept between the directory
    part and the file name, and a path without any directory part yields a
    relative file name instead of one anchored at the filesystem root.

    Args:
        p: '/'-separated path of a log file.

    Returns:
        The sibling ``model-val-info.txt`` path as a string.
    """
    # rpartition gives ('', '', p) when p contains no '/', so nothing is
    # prepended in that case.
    parent, sep, _ = p.rpartition('/')
    return parent + sep + 'model-val-info.txt'
# Rank the parsed logs by (loss, logfile) in ascending order and keep the
# model-val-info paths that belong to the first ten.
ranked_logs = [(record.loss, record.logfile) for record in l]
ranked_logs.sort()
best10 = [get_test_log(logfile) for _loss, logfile in ranked_logs[:10]]

In [None]:
# Display the selected log paths (at most ten) for inspection.
best10

In [None]:
# Combine the parsed logs of the selected runs into a single ensemble object.
ens = ParseTestLogEnsemble([ParseTestLog(i) for i in best10])
print('####### Mean ########')
# Bare attribute read in the middle of a cell: the notebook itself displays
# nothing here, so `Reports` presumably prints its summary as a side effect
# -- confirm in ccgnet.parselog.
ens.Reports
print('####### Bagging ########')
# NOTE(review): `Bagging` is read as an attribute too; it presumably prints the
# bagging report while returning the value stored here -- confirm.
ens_bagging = ens.Bagging

In [None]:
from ccgnet.parselog import TestAccForEachMol
import pandas as pd


# Evaluate the selected runs on each held-out sample list and put the second
# element of each result side by side in one table.
tnt_bagging = TestAccForEachMol(tnt_test, best10, is_return=1, is_print=0)
tnt_column = tnt_bagging[1]
cl20_bagging = TestAccForEachMol(cl20_test, best10, is_return=1, is_print=0)
cl20_column = cl20_bagging[1]
dic = {'TNT': tnt_column, 'CL-20': cl20_column}
pd.DataFrame(dic)