In [34]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

In [12]:
%%file inits.py
# %load inits.py
import tensorflow as tf
import numpy as np


def uniform(shape, scale=0.05, name=None):
    """Create a float32 variable initialised uniformly between -scale and scale.

    :param shape: Shape of the variable to create.
    :param scale: Half-width of the sampling interval.
    :param name: Optional name forwarded to tf.Variable.
    :return: tf.Variable holding the random initial value.
    """
    return tf.Variable(
        tf.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32),
        name=name)

def glorot(shape, name=None):
    """Glorot & Bengio (AISTATS 2010) uniform init.

    The sampling bound is sqrt(6 / (shape[0] + shape[1])), so only the first
    two entries of ``shape`` determine the range.

    :param shape: Shape of the variable; needs at least two entries.
    :param name: Optional name forwarded to tf.Variable.
    :return: tf.Variable holding the random initial value.
    """
    bound = np.sqrt(6.0/(shape[0]+shape[1]))
    sampled = tf.random_uniform(shape, minval=-bound, maxval=bound, dtype=tf.float32)
    return tf.Variable(sampled, name=name)

def zeros(shape, name=None):
    """Create a float32 variable of the given shape filled with zeros.

    :param shape: Shape of the variable to create.
    :param name: Optional name forwarded to tf.Variable.
    """
    return tf.Variable(tf.zeros(shape, dtype=tf.float32), name=name)

def ones(shape, name=None):
    """Create a float32 variable of the given shape filled with ones.

    :param shape: Shape of the variable to create.
    :param name: Optional name forwarded to tf.Variable.
    """
    return tf.Variable(tf.ones(shape, dtype=tf.float32), name=name)

Overwriting inits.py


In [13]:
%%file utils.py
# %load utils.py
import numpy as np
import pickle as pkl
import networkx as nx
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys
import scipy.sparse as sp
import tensorflow as tf
import os

# Seed NumPy's global RNG when this module is imported. load_data() shuffles
# sample indices with np.random, so the split it draws depends on this seed and
# on how many np.random calls have been made before it runs.
seed = 123
np.random.seed(seed)


def test_inputs(features, support, labels):
    if np.isnan(features).any(): raise ValueError("Features contains nan")
    if np.isnan(support).any(): raise ValueError("Support contains nan")
    if np.isnan(labels).any(): raise ValueError("Labels contains nan")
    if np.isinf(features).any(): raise ValueError("Features contains inf")
    if np.isinf(support).any(): raise ValueError("Support contains inf")
    if np.isinf(labels).any(): raise ValueError("Labels contains inf")

        
def parse_index_file(filename):
    """Parse an index file that stores one integer per line.

    :param filename: Path of the text file to read.
    :return: List of the integers in file order.
    :raises ValueError: If a non-blank line is not a valid integer.
    """
    index = []
    # The context manager closes the handle even when int() raises, instead of
    # leaving that to the garbage collector.
    with open(filename) as index_file:
        for line in index_file:
            entry = line.strip()
            if entry:  # tolerate blank lines, e.g. a trailing newline at EOF
                index.append(int(entry))
    return index


def sample_mask(idx, l):
    """Create a boolean mask of length ``l`` that is True at the positions in ``idx``.

    :param idx: Integer index or sequence/array of indices to switch on.
    :param l: Length of the mask.
    :return: 1-D numpy array of dtype bool.
    """
    # The builtin ``bool`` replaces the ``np.bool`` alias, which NumPy
    # deprecated in 1.20 and removed in 1.24.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask


def load_data(dataset_str):
    """
    Loads the pickled input files of one dataset from the ``Data`` directory that
    sits next to the current working directory (``../Data``) and draws a random
    train/validation/test split over the samples.

    Files read (descriptions per the original documentation of this loader):
    ind.dataset_str.x => arr of the feature vectors of the instances as numpy.ndarray object;
    ind.dataset_str.y => arr of the one-hot labels of the instances as numpy.ndarray object (|label| = number of classes);
    ind.dataset_str.graph => arr of adjacency matrices as numpy objects
    ind.dataset_str.sequences => returned unchanged as ``sequences``
    ind.dataset_str.labelorder => returned unchanged as ``labelorder``

    All objects above must be saved using python pickle module. No separate test
    index file is read: the split is drawn here with np.random.

    :param dataset_str: Dataset name
    :return: (adj_ls, features, y_arr, sequences, labelorder, train_mask, val_mask, test_mask);
             the three masks are boolean arrays with one entry per row of y_arr.
    """
    # Address the files relative to the parent directory instead of calling
    # os.chdir(): the process-wide working directory is never touched, so a
    # missing file can no longer leave the process stranded in "..".
    data_dir = os.path.join(os.pardir, "Data")
    names = ['x', 'y', 'graph', 'sequences', 'labelorder']
    objects = []
    for name in names:
        path = os.path.join(data_dir, "ind.{}.{}".format(dataset_str, name))
        with open(path, 'rb') as f:
            # NOTE(review): unpickling executes arbitrary code embedded in the
            # file; only load datasets from a trusted source.
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    features, y_arr, adj_ls, sequences, labelorder = tuple(objects)

    # Split all samples into training, validation and testing: train 60%,
    # validation 10%, test 30%. The split is only reproducible while the global
    # NumPy RNG is in the same state when this function is called.
    num_samples = y_arr.shape[0]
    idx = list(range(num_samples))
    np.random.shuffle(idx)
    cutoff_1 = int(6*len(idx)/10)
    cutoff_2 = int(7*len(idx)/10)
    idx_train = np.sort(idx[:cutoff_1])
    idx_val = np.sort(idx[cutoff_1:cutoff_2])
    idx_test = np.sort(idx[cutoff_2:])

    # make logical indices (one entry per sample)
    train_mask = sample_mask(idx_train, num_samples)
    val_mask = sample_mask(idx_val, num_samples)
    test_mask = sample_mask(idx_test, num_samples)

    return adj_ls, features, y_arr, sequences, labelorder, train_mask, val_mask, test_mask


def parse_many_datasets(datasets):
    """Load one dataset, or several datasets concatenated along the sample axis.

    ``datasets`` is either a plain dataset name or a bracketed, comma separated
    list such as "[a, b]". A string is treated as a plain name only when it
    neither starts with "[" nor ends with "]". Arrays of the individual datasets
    are joined with np.concatenate along axis 0, so their remaining dimensions
    must agree (the original note required the same node-set cardinality).
    ``labelorder`` is taken from the first dataset only.

    :param datasets: Dataset name or bracketed list of dataset names.
    :return: Same 8-tuple as load_data().
    """
    spec = datasets.strip()
    if spec[0] != "[" and spec[-1] != "]":
        return load_data(spec)
    names = spec.strip("[]").replace(" ", "").split(",")

    # Start from the first dataset, then append every further non-empty name.
    adj_ls, features, y_arr, sequences, labelorder, train_mask, val_mask, test_mask = load_data(names[0])
    for name in names[1:]:
        if name == "":
            continue
        more_adj, more_features, more_y, more_sequences, _, more_train, more_val, more_test = load_data(name)
        adj_ls = np.concatenate((adj_ls, more_adj), axis = 0)
        features = np.concatenate((features, more_features), axis = 0)
        y_arr = np.concatenate((y_arr, more_y), axis = 0)
        train_mask = np.concatenate((train_mask, more_train), axis = 0)
        val_mask = np.concatenate((val_mask, more_val), axis = 0)
        test_mask = np.concatenate((test_mask, more_test), axis = 0)
        sequences = sequences + more_sequences
    return adj_ls, features, y_arr, sequences, labelorder, train_mask, val_mask, test_mask
        

def preprocess_features(features):
    """Row-normalize every sample's feature matrix in place.

    Each row features[i, n, :] is divided by its sum; rows that sum to zero are
    left as all zeros.

    :param features: Float array indexed as (sample, node, feature). It is
        modified in place.
    :return: The same ``features`` array, normalized.
    """
    rowsum = features.sum(axis=2)
    # Zero rows produce inf here on purpose; they are reset to 0 just below, so
    # the divide-by-zero warning is silenced.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0.
    # Scaling each row directly is equivalent to left-multiplying every sample
    # by diag(r_inv), without building an N x N matrix per sample.
    features[...] = features * r_inv[:, :, np.newaxis]
    return features


def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix.

    The matrix is first shifted by its maximum entry, then scaled as
    D^-1/2 * A^T * D^-1/2, where D holds the row sums of the shifted matrix.
    Rows whose sum is zero get a scaling factor of 0.

    :param adj: Square 2-D numpy array. It is NOT modified.
    :return: New array holding the normalized matrix.
    """
    # Shift into a new array: the previous in-place ``adj += ...`` silently
    # altered the caller's matrix (and anything it was a view of).
    shifted = adj + np.amax(adj)
    # normalize
    rowsum = shifted.sum(1)
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = np.diag(d_inv_sqrt)
    return shifted.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt)


def preprocess_adj(adj):
    """Add self-loops (the identity matrix) to ``adj`` and normalize the sum.

    :param adj: Square 2-D adjacency matrix.
    :return: Result of normalize_adj() applied to ``adj + I``.
    """
    with_self_loops = adj + np.identity(adj.shape[0])
    return normalize_adj(with_self_loops)


def construct_feed_dict(features, support, labels, placeholders):
    """Map the 'labels', 'features' and 'support' placeholders to their values.

    :param placeholders: Dict providing the keys 'labels', 'features' and 'support'.
    :return: New dict keyed by the placeholder objects.
    """
    return {
        placeholders['labels']: labels,
        placeholders['features']: features,
        placeholders['support']: support,
    }


def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials of the rescaled graph Laplacian up to order k.

    The Laplacian is I - normalize_adj(adj). It is rescaled with its
    largest-magnitude eigenvalue as 2 * L / lambda_max - I before the recurrence
    T_n = 2 * L_scaled * T_(n-1) - T_(n-2) is applied.

    :param adj: Square 2-D adjacency matrix.
    :param k: Highest polynomial order.
    :return: List of dense N x N numpy arrays [T_0, T_1, ..., T_k]. T_0 and T_1
        are always present, even when k < 1.
    """
    num_nodes = adj.shape[0]
    adj_normalized = normalize_adj(adj)
    laplacian = np.identity(num_nodes) - adj_normalized
    try:
        largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    except Exception:
        # The solver occasionally fails for reasons that were never pinned down
        # ("some weird bug" in the original), so the call is retried once. Only
        # Exception is caught so that Ctrl-C / SystemExit still propagate.
        largest_eigval, _ = eigsh(laplacian, 1, which='LM')

    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - np.identity(num_nodes)

    t_k = list()
    t_k.append(np.identity(num_nodes))
    t_k.append(scaled_laplacian)

    # The sparse copy of the scaled Laplacian is the same for every order, so it
    # is built once instead of once per recurrence step.
    s_lap = sp.csr_matrix(scaled_laplacian, copy=True)
    for _order in range(2, k+1):
        t_k.append(2 * s_lap.dot(t_k[-1]) - t_k[-2])

    return t_k

Overwriting utils.py


In [14]:
%%file layers.py
# %load layers.py
from inits import *
import tensorflow as tf

# Shared TensorFlow flags object. This module only obtains the handle; the
# flags themselves are defined by the training script.
flags = tf.app.flags
FLAGS = flags.FLAGS

# Module-wide counter per layer name, used to build unique default layer names.
_LAYER_UIDS = {}


def get_layer_uid(layer_name=''):
    """Return the next unique ID for ``layer_name``.

    IDs start at 1 and grow by one on every call made with the same name.
    """
    next_uid = _LAYER_UIDS.get(layer_name, 0) + 1
    _LAYER_UIDS[layer_name] = next_uid
    return next_uid

def dot(x, y, sparse=False):
    """Multiply ``x`` by ``y``, using the sparse-dense kernel when ``sparse`` is truthy.

    :param sparse: Selects tf.sparse_tensor_dense_matmul over tf.matmul.
    :return: The product tensor.
    """
    if not sparse:
        return tf.matmul(x, y)
    return tf.sparse_tensor_dense_matmul(x, y)
    
class Layer(object):
    """Common base of all layers; the API is modelled on keras (http://keras.io).

    # Properties
        name: String used as the TensorFlow name scope of the layer. When it is
            not supplied it is generated as ``<lowercased class name>_<uid>``.
        logging: Boolean, switches Tensorflow histogram logging on/off
        vars: Dict that subclasses fill with the variables they create.

    # Methods
        _call(inputs): Defines computation graph of layer; the base
            implementation returns its input unchanged.
        __call__(inputs): Runs _call() inside the layer's name scope
        _log_vars(): Writes a histogram summary for every entry of vars
    """

    def __init__(self, **kwargs):
        permitted = {'name', 'logging'}
        for key in kwargs:
            assert key in permitted, 'Invalid keyword argument: ' + key

        layer_name = kwargs.get('name')
        if not layer_name:
            # Fall back to a generated, unique name such as "dense_2".
            prefix = self.__class__.__name__.lower()
            layer_name = prefix + '_' + str(get_layer_uid(prefix))

        self.name = layer_name
        self.vars = {}
        self.logging = kwargs.get('logging', False)

    def _call(self, inputs):
        return inputs

    def __call__(self, inputs):
        scope_prefix = self.name
        with tf.name_scope(scope_prefix):
            if self.logging:
                tf.summary.histogram(scope_prefix + '/inputs', inputs)
            result = self._call(inputs)
            if self.logging:
                tf.summary.histogram(scope_prefix + '/outputs', result)
            return result

    def _log_vars(self):
        for var_name, variable in self.vars.items():
            tf.summary.histogram(self.name + '/vars/' + var_name, variable)

class Flatten(Layer):
    """Collapses every dimension after the first into a single one."""
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _call(self, inputs):
        # Static shape is [None, d1, d2, ...]; the output is [-1, d1*d2*...].
        trailing_dims = inputs.get_shape().as_list()[1:]
        flat_width = np.prod(trailing_dims)
        return tf.reshape(inputs, [-1, flat_width])
    
class Dense(Layer):
    """Fully connected layer: act(dropout(x) @ weights [+ bias]).

    :param input_dim: Size of the last input dimension.
    :param output_dim: Number of output units.
    :param placeholders: Dict; its 'dropout' entry is used as the dropout rate
        when ``dropout`` is truthy.
    :param dropout: Truthy to enable dropout, otherwise the rate is fixed at 0.
    :param act: Activation applied to the output.
    :param bias: Whether to add a zero-initialised bias vector.
    :param featureless: Stored on the instance; not read by this class.
    """
    def __init__(self, input_dim, output_dim, placeholders, dropout=0.,
                 act=tf.nn.relu, bias=False, featureless=False, **kwargs):
        super().__init__(**kwargs)

        self.dropout = placeholders['dropout'] if dropout else 0.
        self.act = act
        self.featureless = featureless
        self.bias = bias

        with tf.variable_scope(self.name + '_vars'):
            self.vars['weights'] = glorot([input_dim, output_dim], name='weights')
            if self.bias:
                self.vars['bias'] = zeros([output_dim], name='bias')

        if self.logging:
            self._log_vars()

    def _call(self, inputs):
        # dropout: the second argument is the keep probability
        kept = tf.nn.dropout(inputs, 1-self.dropout)

        # linear transform, optionally shifted by the bias
        projected = kept @ self.vars['weights']
        if self.bias:
            projected += self.vars['bias']

        return self.act(projected) # BatchOutput

class GraphConvolution(Layer):
    """Graph convolution layer.

    For every edge channel j (last axis of the support placeholder) and every
    support i (axis 1), the node features are multiplied by a dedicated weight
    matrix and then by the corresponding N x N support slice. The results are
    summed over the supports, scaled per channel by the learned
    "Features Combination" vector, summed over the channels, optionally biased
    and passed through ``act``.

    The support placeholder is indexed as ? x Supports x N x N x M.
    """
    def __init__(self, input_dim, output_dim, placeholders, dropout=0.,
                 act=tf.nn.relu, bias=False,
                **kwargs):
        super(GraphConvolution, self).__init__(**kwargs)

        # A truthy `dropout` only switches dropout on; the actual rate comes
        # from the shared 'dropout' placeholder.
        if dropout:
            self.dropout = placeholders['dropout']
        else:
            self.dropout = 0.

        self.act = act
        self.support = placeholders['support']
        self.bias = bias
        self.num_nodes = self.support.get_shape().as_list()[2]
        self.output_dim = output_dim
        
        with tf.variable_scope(self.name + '_vars'):
            # make one weight matrix per (support i, edge channel j) pair
            for i in range(self.support.get_shape().as_list()[1]):# support: ?xSupportsxNxNxM
                for j in range(self.support.get_shape().as_list()[4]):
                    tensor_name = 'weights_support_' + str(i) + '_M_' + str(j)
                    self.vars[tensor_name] = glorot([input_dim, output_dim], name=tensor_name)
            # make vector to do weighted sum of all convolved features (w in SUM(wi*(NxF')) for w in M)
            # NOTE(review): this variable is created without a name, unlike the others.
            self.vars["Features Combination"] = tf.Variable(tf.random_uniform([self.support.get_shape().as_list()[4]]))
            # make bias vector of length output_dim
            if self.bias:
                self.vars['bias'] = zeros([output_dim], name='bias')
        if self.logging:
            self._log_vars()

    def _call(self, inputs):
        x = inputs
        
        # dropout: the second argument is the keep probability
        x = tf.nn.dropout(x, 1-self.dropout)
        
        # convolve
        convolved_features = []
        for j in range(self.support.get_shape().as_list()[4]):
            temp = []
            for i in range(self.support.get_shape().as_list()[1]):
                tensor_name = 'weights_support_' + str(i) + '_M_' + str(j)
                # BatchNF * FF' weight tensor (num_nodes, |Features'|)
                (N, F) = x.get_shape().as_list()[1:]
                # NOTE(review): `embed` is never read, but the reshape still adds an
                # op to the graph; removing it may shift op-level random seeds of
                # later ops under tf.set_random_seed - confirm before deleting.
                embed = tf.reshape(x, [-1, F])
                pre_sup =  tf.reshape(tf.reshape(x, [-1, F]) @ self.vars[tensor_name], [-1, N, self.output_dim])
                # NOTE(review): `batch` and `F_new` are unpacked but never used.
                (batch, _, F_new) = pre_sup.get_shape().as_list()

                # BatchNN * BatchNF' => BatchNF'
                support = tf.slice(self.support, [0,i,0,0,j], [-1,1,-1,-1,1]) # get Batch1NN1
                support = tf.reshape(support, [-1,N,N]) # reshape to BatchNN
                support = support @ pre_sup # now BatchNF'
                temp.append(support)
            # adds together list of BatchNF' into one BatchNF' for a single original adjacency matrix
            convolved_F = tf.add_n(temp)
            convolved_features.append(convolved_F)
        # stack list into one tensor of shape BatchNF'M
        convolved_features = tf.stack(convolved_features, axis = 3)
        # scale each edge channel by its learned combination weight (broadcast over the last axis)
        convolved_features = tf.multiply(convolved_features, self.vars["Features Combination"])
        # sum together to remove 4th dimension
        output = tf.reduce_sum(convolved_features, axis = 3)
        
        # bias
        if self.bias:
            output += self.vars['bias'] # Broadcasting spreads bias across Batch and Node dimensions

        return self.act(output)

class SelfAttention(Layer):
    """Self attention layer, input is in ?xNxhidden, output is in ?x(Bias*Hidden). Hidden should correspond
    to the number of features nodes have.

    After a call, the attention weights (?xBiasxN, softmax over the last axis)
    are available as ``self.A``; before the first call ``self.A`` is None.
    """

    def __init__(self, attention_dim, bias_dim, hidden_units, placeholders, dropout=0., **kwargs):
        super().__init__(**kwargs)

        # A truthy `dropout` switches dropout on; the rate comes from the placeholder.
        self.dropout = placeholders['dropout'] if dropout else 0.
        self.hidden_units = hidden_units
        self.A = None

        with tf.variable_scope(self.name + '_vars'):
            self.vars['Ws'] = glorot([attention_dim, self.hidden_units]) # AttentionxHidden
            self.vars['W2'] = glorot([bias_dim, attention_dim]) # BiasxAttention

    def _call(self, inputs):
        # dropout, then swap the two inner axes: ?xNxHidden => ?xHiddenxN
        dropped = tf.nn.dropout(inputs, 1-self.dropout)
        dropped_t = tf.transpose(dropped, perm = [0, 2, 1])

        # AttentionxHidden * ?xHiddenxN => ?xAttentionxN
        hidden_scores = tf.tanh(tf.einsum('ah,bhn->ban', self.vars['Ws'], dropped_t))

        # BiasxAttention * ?xAttentionxN => ?xBiasxN, normalised with softmax
        self.A = tf.nn.softmax(tf.einsum('ba,uan->ubn',self.vars['W2'], hidden_scores))

        # ?xBiasxN * ?xNxHidden => ?xBiasxHidden
        pooled = self.A @ dropped

        # ?xBiasxHidden => ?x(Bias*Hidden)
        pooled_dims = pooled.get_shape().as_list()
        return tf.reshape(pooled, [ -1, pooled_dims[1] * pooled_dims[2]])

Overwriting layers.py


In [15]:
%%file models.py
# %load models.py
from layers import *
from metrics import *

# Shared TensorFlow flags object. The models below read FLAGS.learning_rate,
# FLAGS.weight_decay, FLAGS.balanced_training, FLAGS.graph_conv_dimensions,
# FLAGS.connected_dimensions, FLAGS.attention_dim and FLAGS.attention_bias,
# which the training script defines.
flags = tf.app.flags
FLAGS = flags.FLAGS

class Model(object):
    """Base class for sequential models.

    Subclasses implement _build() (fill ``self.layers``), _loss() and
    _accuracy(), and must set ``self.inputs`` and ``self.optimizer`` before
    build() is called. Accepted keyword arguments are 'name' (defaults to the
    lowercased class name) and 'logging' (defaults to False).
    """
    def __init__(self, **kwargs):
        permitted = {'name', 'logging'}
        for key in kwargs:
            assert key in permitted, 'Invalid keyword argument: ' + key

        self.name = kwargs.get('name') or self.__class__.__name__.lower()
        self.logging = kwargs.get('logging', False)

        # Containers filled while the graph is built
        self.vars = {}
        self.placeholders = {}
        self.layers = []
        self.activations = []

        # Tensors exposed to the training script
        self.inputs = None
        self.outputs = None
        self.logits = None
        self.predictions = None
        self.attentions = None

        # Metrics and optimisation
        self.loss = 0
        self.accuracy = 0
        self.f1_score = 0
        self.optimizer = None
        self.opt_op = None

    def _build(self):
        raise NotImplementedError

    def build(self):
        """ Wrapper for _build() that also chains the layers and builds the metrics """
        with tf.variable_scope(self.name):
            self._build()

        # Feed every layer with the output of the previous one
        self.activations.append(self.inputs)
        for layer in self.layers:
            self.activations.append(layer(self.activations[-1]))
        self.outputs = self.activations[-1]

        # Index the global variables created under this model's scope by name
        scoped_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
        self.vars = {variable.name: variable for variable in scoped_variables}

        # Metrics first, then the training op that minimises the loss
        self._loss()
        self._accuracy()
        self.opt_op = self.optimizer.minimize(self.loss)

    def predict(self):
        pass

    def _loss(self):
        raise NotImplementedError

    def _accuracy(self):
        raise NotImplementedError

    def save(self, sess=None):
        """Write the model variables to tmp/<name>.ckpt using ``sess``."""
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        checkpoint = tf.train.Saver(self.vars).save(sess, "tmp/%s.ckpt" % self.name)
        print("Model saved in file: %s" % checkpoint)

    def load(self, sess=None):
        """Restore the model variables from tmp/<name>.ckpt into ``sess``."""
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        checkpoint = "tmp/%s.ckpt" % self.name
        tf.train.Saver(self.vars).restore(sess, checkpoint)
        print("Model restored from file: %s" % checkpoint)


class GCN(Model):
    """Graph-level classifier: graph convolutions -> self attention -> dense layers.

    Layer sizes come from FLAGS.graph_conv_dimensions and
    FLAGS.connected_dimensions; a final dense layer with as many units as there
    are label columns is always appended. The graph is built in the constructor.

    :param placeholders: Dict with the 'features', 'labels', 'support' and
        'dropout' placeholders.
    :param input_dim: Number of input features per node.
    """
    def __init__(self, placeholders, input_dim, **kwargs):
        super(GCN, self).__init__(**kwargs)
        self.inputs = placeholders['features']
        self.input_dim = input_dim
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]
        self.placeholders = placeholders
        self.number_nodes = placeholders['features'].get_shape().as_list()[1]
        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
        self.attention_layer = None
        self.build()

    def _loss(self):
        """Add L2 weight decay of the first layer plus the (optionally class-balanced) cross entropy."""
        # Weight decay loss (all variables of the first layer only)
        for var in self.layers[0].vars.values():
            self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)

        if FLAGS.balanced_training == "True":
            labels = self.placeholders['labels']
            logits = self.outputs
            # Get relative frequency of each class
            class_counts = tf.reduce_sum(labels, 0)
            class_frequencies = class_counts / tf.reduce_sum(class_counts)
            # Cross entropy error
            entropies = tf.nn.softmax_cross_entropy_with_logits_v2(logits = logits, labels = labels, dim = -1)
            # A class that does not occur in the batch has frequency 0, and
            # dividing its all-zero label column by 0 gives NaN, which poisons
            # the whole loss. Use 1 for such classes: their column then
            # contributes exactly 0, and present classes are unaffected.
            safe_frequencies = tf.where(class_counts > 0,
                                        class_frequencies,
                                        tf.ones_like(class_frequencies))
            # Scale by 1/frequency
            scalers = labels / safe_frequencies
            scalers = tf.reduce_sum(scalers, 1)
            entropies_scaled = scalers * entropies
            self.loss += tf.reduce_mean(entropies_scaled)
        else:
            # Cross entropy error
            labels = self.placeholders['labels']
            logits = self.outputs
            entropies = tf.nn.softmax_cross_entropy_with_logits_v2(logits = logits, labels = labels, dim = -1)
            loss = tf.reduce_mean(entropies)
            self.loss += loss

    def _accuracy(self):
        """Expose logits, attention weights, predictions and a resettable streaming accuracy."""
        labels = self.placeholders['labels']
        logits = self.outputs
        self.logits = logits
        self.attentions = self.attention_layer.A
        labels=tf.argmax(labels, 1) # true class index
        predictions=tf.argmax(logits, 1) # predicted class index
        self.predictions = predictions

        # tf.metrics.accuracy returns (value, update_op); the update op is what
        # gets exposed, so every run of self.accuracy accumulates into the counters
        tf_metric, tf_metric_update = tf.metrics.accuracy(predictions = predictions, labels = labels, name = "accuracy")
        self.accuracy = tf_metric_update

        # Isolate the variables stored behind the scenes by the metric operation
        running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="accuracy")

        # Define initializer to initialize/reset running variables
        self.running_vars_initializer = tf.variables_initializer(var_list=running_vars)

    def _build(self):
        """Create the layer stack described by the flags."""
        # Parse through the graph convolutional layer specifications and the hidden layers
        def parse_array(string):
            sl = list(string)
            # NOTE(review): this only rejects strings that end with "]" without
            # starting with "["; other malformed inputs pass through. Left as is
            # because tightening it would reject values accepted today.
            if sl[0] != "[" and sl[-1] == "]":
                raise ValueError("Invalid dimensions input")
            string = string.strip("[]")
            string = string.replace(" ", "")
            num_ls = string.split(",")
            return [int(x) for x in num_ls if x != ""]
        graph_convolution_dimensions = parse_array(FLAGS.graph_conv_dimensions)
        fully_connected_dimensions = parse_array(FLAGS.connected_dimensions)

        # Graph Convolutional Layers
        prior_dimension = self.input_dim
        for gcdim in graph_convolution_dimensions:
            self.layers.append(GraphConvolution(input_dim=prior_dimension,
                                                output_dim=gcdim,
                                                placeholders=self.placeholders,
                                                act=tf.nn.relu,
                                                dropout=True,
                                                logging=self.logging))
            prior_dimension = gcdim

        # Self Attention
        self.layers.append(SelfAttention(attention_dim=FLAGS.attention_dim,
                                         bias_dim=FLAGS.attention_bias,
                                         hidden_units=prior_dimension,
                                         placeholders=self.placeholders,
                                         dropout=True,
                                         logging=self.logging))
        self.attention_layer = self.layers[-1]

        # Fully Connected Layers; the attention output has Bias*Hidden columns
        fully_connected_dimensions.append(self.output_dim)
        prior_dimension = prior_dimension * FLAGS.attention_bias
        # NOTE(review): the last layer (the logits) also uses ReLU, so logits can
        # never be negative - confirm this is intended before changing it, since
        # it alters what the model learns.
        for fcdim in fully_connected_dimensions:
            self.layers.append(Dense(input_dim=prior_dimension,
                                     output_dim=fcdim,
                                     act=tf.nn.relu,
                                     placeholders=self.placeholders,
                                     dropout=True,
                                     logging=self.logging))
            prior_dimension = fcdim

    def predict(self):
        """Return the softmax of the model outputs."""
        logits = self.outputs
        return tf.nn.softmax(logits)


Overwriting models.py


In [16]:
%%file train.py
# %load train.py

import time
import tensorflow as tf
import numpy as np
from utils import *
from models import GCN
import pandas as pd
import os
import random

# Set random seed for both NumPy and TensorFlow
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
# Boolean-like options (save_validation, save_test, balanced_training) are
# string flags; the code compares them against the literal "True".
flags = tf.app.flags
FLAGS = flags.FLAGS
# 'dataset' may be a single name or a bracketed list such as "[a,b]"
flags.DEFINE_string('dataset', 'wholeset', 'Dataset string.')
# 'gcn' or 'gcn_cheby'; anything else raises ValueError further down
flags.DEFINE_string('model', 'gcn', 'Model string.')
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
# Layer sizes are given as bracketed, comma separated integers
flags.DEFINE_string('graph_conv_dimensions', '[20,20]', 'Number of units in each graph convolution layer.')
flags.DEFINE_string('connected_dimensions','[]', 'Number of units in each FC layer.')
flags.DEFINE_integer('attention_bias', 2, 'Attention Bias.')
flags.DEFINE_integer('attention_dim', 5, 'Attention Dimension.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
# Only used when model == 'gcn_cheby'
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')
flags.DEFINE_string('save_validation', "False", "If you should save validation accuracy")
flags.DEFINE_string('save_test', "False", "If this is a optimized run! Use all data and save outputs")
# The default 'testset' means "no independent test dataset"
flags.DEFINE_string('test_dataset', 'testset', "If we are testing with a unique test_set")
flags.DEFINE_string('balanced_training', 'False', "use a weighted classwise loss to prevent favoring larger class")

# Load data
adj_ls, features, y_arr, sequences, labelorder, train_mask, val_mask, test_mask = parse_many_datasets(FLAGS.dataset)

# Check for independent test_dataset
if FLAGS.test_dataset != "testset":
    adj_ls_test, features_test, y_arr_test, sequences_test, _, _, _, test_mask = parse_many_datasets(FLAGS.test_dataset)
    adj_ls = np.concatenate((adj_ls, adj_ls_test), axis = 0)
    features = np.concatenate((features, features_test), axis = 0)
    y_arr = np.concatenate((y_arr, y_arr_test), axis = 0)
    sequences = sequences + sequences_test
    # Every sample of the primary dataset(s) comes first and is NOT a test
    # sample; every sample of the independent dataset follows and IS one.
    # (The builtin bool replaces np.bool, which NumPy removed in 1.24.)
    test_mask = np.concatenate((np.zeros(len(train_mask), dtype = bool),
                                np.ones(len(test_mask), dtype = bool)))
    # Everything that is not a test sample is split 6/7 train, 1/7 validation
    train_mask = np.logical_not(test_mask)
    idx = [i for i in range(sum(train_mask))]
    np.random.shuffle(idx)
    cutoff = int(6*len(idx)/7)
    val_ind = idx[cutoff:]
    train_ind = idx[:cutoff]
    # The non-test samples occupy positions 0..len(idx)-1, so the shuffled
    # positions can be used directly as indices into the full-length masks.
    # Index assignment replaces the former per-element list membership tests.
    val_mask = np.zeros(test_mask.shape[0], dtype = bool)
    val_mask[val_ind] = True
    train_mask = np.zeros(test_mask.shape[0], dtype = bool)
    train_mask[train_ind] = True

# Name used when saving results; it encodes the hyper-parameters of this run
model_desc = "lr_{7}_epoch_{8}_stop_{9}_gc_{0}_do_{1}_ad_{2}_ab_{3}_fc_{4}_m_{5}_deg_{6}".format(
    FLAGS.graph_conv_dimensions, FLAGS.dropout, FLAGS.attention_dim,
    FLAGS.attention_bias, FLAGS.connected_dimensions, FLAGS.model, FLAGS.max_degree,
    FLAGS.learning_rate, FLAGS.epochs, FLAGS.early_stopping)

# Both supported models use the GCN class; they differ only in the number of
# supports (1 for 'gcn', 1 + max_degree for 'gcn_cheby')
if FLAGS.model not in ('gcn', 'gcn_cheby'):
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))
model_func = GCN
num_supports = 1 if FLAGS.model == 'gcn' else 1 + FLAGS.max_degree

# String flags become real booleans: only the exact text "True" enables them
save_validation = FLAGS.save_validation == "True"
save_test = FLAGS.save_test == "True"

# Result tables are only needed when the test outputs are saved
if save_test:
    epoch_df = pd.DataFrame(np.zeros(shape = (FLAGS.epochs, 5)))
    labels_df = pd.DataFrame(np.zeros(shape = (sum(test_mask), 5)))

# Print Basic Information
print(f"Graph: {FLAGS.dataset}, {FLAGS.test_dataset}\nModel {model_desc}")

# Size of Different Sets
print("|Training| {}, |Validation| {}, |Testing| {}".format(np.sum(train_mask), np.sum(val_mask), np.sum(test_mask)))

# Reference point for the total run time
ttot = time.time()

# Preload support tensor so that it isn't needlessly calculated many times
# adj_ls is unpacked as (batch, <ignored>, N, M); adj_ls[b][:,:,m] below is used
# as one adjacency matrix - presumably N x N with M edge types, TODO confirm
batch,_,N,M = adj_ls.shape
support_tensor = np.zeros(shape=(batch,num_supports,N,N,M)) # of shape (Batch,Num_Supports,Num_Nodes,Num_Nodes,Num_Edge)
if FLAGS.model == "gcn_cheby":
    print("Calculating Chebyshev polynomials up to order {}...".format(FLAGS.max_degree))
else:
    print("Preprocessing adjacency lists")
for b in range(batch):
    # NOTE(review): this assignment is overwritten on the first inner iteration
    adj = adj_ls[b]
    for m in range(M):
        # adjacency matrix of sample b for the m-th slice of the last axis; this
        # is a view, so any in-place change made by a callee writes through to adj_ls
        adj = adj_ls[b][:,:,m]
        if FLAGS.model == 'gcn':
            support = [preprocess_adj(adj)]
        elif FLAGS.model == 'gcn_cheby':
            support = chebyshev_polynomials(adj, FLAGS.max_degree)
        # add NxN matrices along the num_supports dimension
        sup = np.stack(support, axis=0)
        # add num_supportsxNxN to support tensor
        support_tensor[b,:,:,:,m] = sup

# Normalize all features (rebinds `features` to the value returned by the helper)
features = preprocess_features(features)

# Test processed inputs: raises ValueError on NaN or infinite values
test_inputs(features, support_tensor, y_arr)

# Define placeholders
# The leading None leaves the batch size open; 'dropout' defaults to 0 unless
# it is fed explicitly
F = features.shape[2]
placeholders = {
    'support': tf.placeholder(tf.float32, shape=(None,num_supports,N,N,M)), # ?xnum_supportsxNxNxM
    'features': tf.placeholder(tf.float32, shape=(None,N,F)), # ?xNxF
    'labels': tf.placeholder(tf.float32, shape=(None, y_arr.shape[1])), # ?,|labels|
    'dropout': tf.placeholder_with_default(0., shape=())
}

# Define model evaluation function
def evaluate(features, support, labels, mask, placeholders, model):
    """Run the model on the samples selected by ``mask`` without feeding dropout.

    Uses the module-level session ``sess``.

    :param mask: Boolean (or index) selector applied to the first axis of
        ``features``, ``support`` and ``labels``.
    :return: (loss, accuracy, elapsed seconds).
    """
    started = time.time()
    feed_dict = construct_feed_dict(features[mask,:,:],
                                    support[mask,:,:,:],
                                    labels[mask, :],
                                    placeholders)
    loss_value, accuracy_value = sess.run([model.loss, model.accuracy], feed_dict=feed_dict)
    return loss_value, accuracy_value, (time.time() - started)

def optimize():
    """Train on the training split and checkpoint whenever validation accuracy improves.

    Works on the module-level data (features, support_tensor, y_arr, the masks),
    the placeholders, and the globals ``sess``, ``model``, ``saver`` and
    ``save_path_val``. Every epoch performs one full-batch training step followed
    by a validation pass; the streaming accuracy counters are reset before each
    of the two so that every reported accuracy covers a single pass only.

    :return: (best validation accuracy, index of the last epoch that was run).
    """
    # Train model
    print("\nOptimization of Stopping Conditions:")
    t = time.time()
    last_improvement = 0
    best_accuracy = 0
    improved_str = ''
    # The training inputs are identical in every epoch, so they are sliced and
    # packed into the feed dictionary once instead of once per epoch.
    feed_dict = construct_feed_dict(features[train_mask,:,:],
                                    support_tensor[train_mask,:,:,:],
                                    y_arr[train_mask, :],
                                    placeholders)
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})
    for epoch in range(FLAGS.epochs):
        # Reset the counters
        sess.run(model.running_vars_initializer)
        # Training step
        outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
        # Reset the counters
        sess.run(model.running_vars_initializer)
        # Validation evaluation
        cost, acc, _ = evaluate(features, support_tensor, y_arr, val_mask, placeholders, model)
        # Save the model IF validation accuracy is a new maximum
        if acc > best_accuracy:
            best_accuracy = acc
            last_improvement = epoch
            saver.save(sess=sess, save_path=save_path_val)
            improved_str += '*'
        # Print results every 20 epochs; one '*' per improvement since the last report
        if (epoch + 1) % 20 == 0:
            print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),"train_acc=", "{:.5f}".format(outs[2]),
                  "val_loss=", "{:.5f}".format(cost), "val_acc=", "{:.5f}".format(acc),"time=", "{:.5f}".format(time.time() - t), improved_str)
            t = time.time()
            improved_str = ''
        # NOTE(review): FLAGS.early_stopping only acts as a warm-up here; the
        # patience is the hard-coded 200 epochs, so with the default of 200
        # epochs this never triggers. Confirm whether the flag was meant to be
        # the patience before changing it.
        if epoch > FLAGS.early_stopping and epoch - last_improvement > 200:
            print("Early stopping...")
            break
    print("Optimization Finished! Total Time: {} sec".format(time.time() - ttot))
    return best_accuracy, epoch

def testing_results(epoch_final, patience=200):
    """Retrain from scratch on train + validation and track test metrics.

    Variables are re-initialized, then each epoch runs one optimizer step on
    the combined training/validation subset, evaluates on the test mask and
    records ``[train_loss, train_acc, test_loss, test_acc, epoch_time]`` in
    row ``epoch`` of the module-level ``epoch_df``. The session is saved to
    ``save_path_test`` whenever the training accuracy strictly improves.

    Args:
        epoch_final: Epoch index returned by the validation run; early
            stopping is only considered once ``epoch > epoch_final``.
        patience: Number of epochs without a training-accuracy improvement
            that triggers early stopping. Defaults to 200, the value that
            used to be hard-coded.

    Returns:
        The best training accuracy observed.
    """
    # Initialize session
    print("\nTraining on test set:")
    sess.run(tf.global_variables_initializer())
    sess.run(model.running_vars_initializer)
    # Combine training and validation.
    # The mask was previously built from test_mask and val_mask, which
    # contradicted this comment and trained on the rows used for evaluation.
    mask = np.logical_or(train_mask, val_mask)
    # The training subset and its feed dictionary are the same on every
    # epoch, so build them once instead of re-slicing the tensors each time.
    # NOTE(review): assumes construct_feed_dict is deterministic and that
    # evaluate() does not modify its inputs -- confirm.
    features_train = features[mask,:,:]
    support = support_tensor[mask,:,:,:]
    y_train = y_arr[mask, :]
    feed_dict = construct_feed_dict(features_train, support, y_train, placeholders)
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})
    # Train model
    t = time.time()
    last_improvement = 0
    best_accuracy = 0
    improved_str = ''
    for epoch in range(FLAGS.epochs):
        t_epoch = time.time()
        # Reset the running metric counters before the training step
        sess.run(model.running_vars_initializer)
        # Training step
        outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
        # Reset the counters again so test metrics start from scratch
        sess.run(model.running_vars_initializer)
        # Evaluate
        cost, acc, duration = evaluate(features, support_tensor, y_arr, test_mask, placeholders, model)
        epoch_df.iloc[epoch, :] = [outs[1], outs[2], cost, acc, time.time() - t_epoch]
        # Save the model IF training accuracy is a maximum
        if outs[2] > best_accuracy:
            best_accuracy = outs[2]
            last_improvement = epoch
            saver.save(sess=sess, save_path=save_path_test)
            improved_str += '*'
        # Print results
        if (epoch + 1) % 20 == 0:
            print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),"train_acc=", "{:.5f}".format(outs[2]),
                  "test_loss=", "{:.5f}".format(cost), "test_acc=", "{:.5f}".format(acc),"time=", "{:.5f}".format(time.time() - t), improved_str)
            improved_str = ''
            t = time.time()
        # Stop once past the epoch count of the validation run and no
        # improvement for `patience` epochs
        if epoch > epoch_final and epoch - last_improvement > patience:
            print("Early stopping...")
            break
    # ttot is a module-level start timestamp defined outside this function
    print("Optimization for Test Finished! Total Time: {} sec".format(time.time() - ttot))
    return best_accuracy

# Create model
model = model_func(placeholders, input_dim=features.shape[2], logging=True)

# Initialize session
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(model.running_vars_initializer)

# Make saver
saver = tf.train.Saver()
save_dir = 'checkpoints/'
# exist_ok avoids the check-then-create race when several runs share this
# directory (each run tags its files with a random id below).
os.makedirs(save_dir, exist_ok=True)

# Random six-digit tag that keeps this run's checkpoint files apart
num = random.randint(100000,999999)
# Absolute checkpoint prefixes under <cwd>/checkpoints/
save_path_val = os.path.join(os.getcwd(), save_dir, f'best_validation_{num}')
save_path_test = os.path.join(os.getcwd(), save_dir, f'best_training_{num}')

# Do optimization on validation set
accuracy_validation, final_epoch = optimize()

# Validation
if save_validation:
    # Results are appended to ../Results/validation_results.txt relative to
    # the working directory. The path is computed directly instead of
    # changing directory and back, so an error cannot leave the process in
    # the parent directory.
    results = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "Results"))
    os.makedirs(results, exist_ok=True)
    txt = os.path.join(results, "validation_results.txt")
    header = ["Dataset", "Test Dataset", "Validation Accuracy", "Max Epochs", "Final Epoch", "Model", "Max Degree",
              "Learning Rate", "Dropout", "Attention Dimension", "Attention Bias", "Graph Convolution Dimensions",
              "Fully Connected Dimensions", "Balanced Training", "Weight Decay", "Early Stopping"]
    vals = [FLAGS.dataset, FLAGS.test_dataset, accuracy_validation, FLAGS.epochs,final_epoch, FLAGS.model, FLAGS.max_degree,
            FLAGS.learning_rate, FLAGS.dropout, FLAGS.attention_dim, FLAGS.attention_bias, FLAGS.graph_conv_dimensions,
            FLAGS.connected_dimensions, FLAGS.balanced_training, FLAGS.weight_decay, FLAGS.early_stopping]
    # The header is written only when the file does not exist yet
    write_header = not os.path.exists(txt)
    with open(txt, "a") as fh:
        if write_header:
            fh.write("\t".join(header) + "\n")
        # Join with tabs so a row has exactly as many fields as the header;
        # the previous per-value "\t" suffix left a trailing empty field.
        fh.write("\t".join(str(val) for val in vals) + "\n")

# Test
if save_test:
    # Retrain through testing_results(); it returns the best training accuracy it reached
    accuracy_train = testing_results(final_epoch)
    # Use the validation checkpoint only when its accuracy is strictly higher
    path = save_path_val if accuracy_validation > accuracy_train else save_path_test
    # Re-initialize the variables, then overwrite them with the chosen checkpoint
    sess.run(tf.global_variables_initializer())
    saver.restore(sess=sess, save_path=path)
    # Score the restored model on the test mask
    test_cost, test_acc, test_duration = evaluate(features, support_tensor, y_arr, test_mask, placeholders, model)
    print("Test set results:", "cost=", "{:.5f}".format(test_cost),
      "accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration))
    # Column names for the per-epoch and per-sample result frames
    epoch_df.columns = ["train_loss", "train_acc", "test_loss", "test_acc", "time"]
    labels_df.columns = ["Sequence", "Label", "Prediction", "Negative Class Logit", "Positive Class Logit"]
    # Test-only slices of the inputs
    features_test = features[test_mask,:,:]
    support_test = support_tensor[test_mask,:,:,:]
    labels_test = y_arr[test_mask, :]
    # Sequences selected by the test mask (reused for the attention table below)
    seq_test = [sequences[idx] for idx, keep in enumerate(test_mask) if keep]
    labels_df.iloc[:, 0] = seq_test
    # True labels: indices of the non-zero entries in each label row
    n_test = sum(test_mask)
    labels_df.iloc[:, 1] = [np.where(labels_test[row])[0] for row in range(n_test)]
    # One forward pass yields the logits, predictions and attention values
    feed_dict = construct_feed_dict(features_test, support_test, labels_test, placeholders)
    logits, predictions, attentions = sess.run([model.logits, model.predictions, model.attentions], feed_dict=feed_dict)
    labels_df.iloc[:, 3:5] = logits
    labels_df.iloc[:, 2] = predictions
    # Stack the attention array's first two axes into rows; cast to float64
    n_batch = attentions.shape[0]
    n_bias = attentions.shape[1]
    n_cols = attentions.shape[2]
    att = attentions.reshape(n_batch * n_bias, n_cols).astype(np.float64)
    attention_df = pd.DataFrame(att)
    # Row bookkeeping: index along the second axis, index along the first axis, and its sequence
    bias_vals = list(range(n_bias)) * n_batch
    batch_vals = [b for b in range(n_batch) for _ in range(n_bias)]
    s = [seq_test[b] for b in range(n_batch) for _ in range(n_bias)]
    attention_df["Bias"] = bias_vals
    attention_df["Batch"] = batch_vals
    attention_df["Sequence"] = s
    attention_df["N"] = n_cols
    # Replace label indices with their names in the Label and Prediction columns
    for col in (1, 2):
        labels_df.iloc[:, col] = labels_df.iloc[:, col].map(lambda x: labelorder[x])
    # Output file stem: plain dataset name unless a separate test dataset was given
    datadesc = FLAGS.dataset if FLAGS.test_dataset == "testset" else "train_" + FLAGS.dataset + "_test_" + FLAGS.test_dataset
    # write to file
    epoch_df.to_csv("../Results/{}.{}.epoch.csv".format(model_desc, datadesc), index = False)
    labels_df.to_csv("../Results/{}.{}.predictions.csv".format(model_desc, datadesc), index = False)
    attention_df.to_csv("../Results/{}.{}.attentions.csv".format(model_desc, datadesc), index = False)

# Delete every file in the checkpoint directory whose name contains this run's id
run_tag = str(num)
for checkpoint_name in os.listdir(save_dir):
    if run_tag not in checkpoint_name:
        continue
    os.remove(os.path.join(save_dir, checkpoint_name))

Overwriting train.py


In [17]:
!python3 train.py -balanced_training True -learning_rate .01 -epochs 20 -early_stopping 500 -graph_conv_dimensions [20] -connected_dimensions [] -dropout 0 -attention_dim 10 -attention_bias 2 -model gcn -max_degree 1 -dataset [selector_8_ang_ratio_0_params_all_onehot_distance,selector_8_ang_ratio_0_params_all_onehot_distance] -save_test False -save_validation True

Graph: [selector_8_ang_ratio_0_params_all_onehot_distance,selector_8_ang_ratio_0_params_all_onehot_distance], testset
Model lr_0.01_epoch_20_stop_500_gc_[20]_do_0.0_ad_10_ab_2_fc_[]_m_gcn_deg_1
|Training| 1992, |Validation| 332, |Testing| 996
Preprocessing adjacency lists
2019-07-29 18:49:10.598432: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA

Optimization of Stopping Conditions:
Epoch: 0020 train_loss= 1.39334 train_acc= 0.25251 val_loss= 1.39248 val_acc= 0.29217 time= 16.72855 *
Optimization Finished! Total Time: 19.015727043151855 sec


In [9]:
!python3 train.py -learning_rate 0.01 -epochs 20 -early_stopping 200 -graph_conv_dimensions [20,20] -dropout 0 -attention_dim 10 -attention_bias 3 -model gcn_cheby -max_degree 3 -connected_dimensions [10,10] -dataset selector_8_ang_ratio_0_params_all_onehot_distance -save_test True -save_validation False

Graph selector_8_ang_ratio_0_params_all_onehot_distance, testset
Model lr_0.01_epoch_200_stop_200_gc_[20,20]_do_0.0_ad_10_ab_3_fc_[10,10]_m_gcn_cheby_deg_3
|Training| 996, |Validation| 166, |Testing| 498
Calculating Chebyshev polynomials up to order 3...
  d_inv_sqrt = np.power(rowsum, -0.5).flatten()
2019-07-29 18:30:34.379600: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA

Optimization of Stopping Conditions:
Epoch: 0020 train_loss= 0.72028 train_acc= 0.27108 val_loss= 0.71686 val_acc= 0.25301 time= 25.53053 *
Epoch: 0040 train_loss= 0.61815 train_acc= 0.72892 val_loss= 0.58798 val_acc= 0.74699 time= 26.86422 **
Epoch: 0060 train_loss= 0.58068 train_acc= 0.72892 val_loss= 0.56265 val_acc= 0.74699 time= 26.78343 
Epoch: 0080 train_loss= 0.45686 train_acc= 0.78614 val_loss= 0.52249 val_acc= 0.76506 time= 27.19014 *
Epoch: 0100 train_loss= 0.39590 train_acc= 0.80221 val_loss= 0.42050 val_

In [27]:
import os
%time !python3 consolidate_results.py

CPU times: user 46.9 ms, sys: 46.9 ms, total: 93.8 ms
Wall time: 1.72 s


In [27]:
import numpy as np
# Public import path: the private scipy.sparse.linalg.eigen.arpack module is
# not importable in recent SciPy releases, while this path works in old and
# new versions alike.
from scipy.sparse.linalg import eigsh

# Scratch check of eigsh on a dense 26x26 identity matrix
laplacian = np.identity(26)
print(type(laplacian))
print(laplacian)
print(laplacian.shape)
# Bare last expression so the notebook displays the (eigenvalues, eigenvectors)
# pair for the single largest-magnitude eigenvalue.
eigsh(laplacian, 1, which='LM')

<class 'numpy.ndarray'>
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0

(array([1.]), array([[ 0.06820068],
        [ 0.05824877],
        [-0.15169597],
        [ 0.03231936],
        [ 0.0931789 ],
        [ 0.00290181],
        [ 0.06703742],
        [ 0.36320261],
        [-0.26347516],
        [-0.05083581],
        [-0.10620215],
        [ 0.25611725],
        [-0.12235471],
        [ 0.20537812],
        [ 0.30986563],
        [ 0.24861462],
        [-0.13556621],
        [ 0.02169414],
        [-0.12080429],
        [ 0.45722021],
        [-0.14662624],
        [ 0.0022738 ],
        [-0.13244341],
        [ 0.02799337],
        [ 0.37835751],
        [ 0.18293973]]))

In [18]:
import pandas as pd
# Load the tab-separated validation log from ../Results.
# Rows in this file may end with a trailing tab; index_col=False stops pandas
# from treating the first column as the index in that case.
df = pd.read_csv("../Results/validation_results.txt",sep = "\t",index_col=False)
# Bare last expression so the notebook renders the frame
df

Unnamed: 0,Dataset,Test Dataset,Validation Accuracy,Max Epochs,Final Epoch,Model,Max Degree,Learning Rate,Dropout,Attention Dimension,Attention Bias,Graph Convolution Dimensions,Fully Connected Dimensions,Balanced Training,Weight Decay,Early Stopping
0,[protease_HCV_selector_k_nearest_ratio_0_param...,testset,0.902166,1000,557,gcn_cheby,3,0.005,0.0,10,1,"[20,]",[],False,0.0005,200
1,protease_HCV_selector_k_nearest_ratio_0_params_0,testset,0.900545,1000,464,gcn_cheby,3,0.005,0.0,10,1,"[20,]",[],False,0.0005,200
2,protease_HCV_A171T_selector_k_nearest_ratio_0_...,testset,0.938683,1000,407,gcn_cheby,3,0.005,0.0,10,1,"[20,]",[],False,0.0005,200
3,protease_HCV_D183A_selector_k_nearest_ratio_0_...,testset,0.885329,1000,401,gcn_cheby,3,0.005,0.0,10,1,"[20,]",[],False,0.0005,200
4,protease_HCV_R170K_A171T_D183A_selector_k_near...,testset,0.913743,1000,381,gcn_cheby,3,0.005,0.0,10,1,"[20,]",[],False,0.0005,200
5,[protease_HCV_selector_k_nearest_ratio_0_param...,testset,0.905733,1000,479,gcn_cheby,3,0.005,0.0,10,2,"[20,]",[],False,0.0005,200
6,[protease_HCV_selector_k_nearest_ratio_0_param...,testset,0.338853,1000,201,gcn_cheby,3,0.010,0.0,10,2,"[10,]","[20,20,]",False,0.0005,200
7,[protease_HCV_selector_k_nearest_ratio_0_param...,testset,0.338853,1000,201,gcn_cheby,3,0.005,0.0,10,2,"[10,]","[20,20,]",False,0.0005,200
8,[protease_HCV_selector_k_nearest_ratio_0_param...,testset,0.338853,1000,201,gcn_cheby,3,0.005,0.0,10,2,"[10,10,]","[20,20,]",False,0.0005,200
9,protease_HCV_selector_k_nearest_ratio_0_params_0,testset,0.280654,1000,201,gcn_cheby,3,0.010,0.0,10,2,"[10,]","[20,20,]",False,0.0005,200
