In [49]:
dataset = 'R8'  # Corpus name; load_corpus uses it to build the 'cleaned_data/<dataset>/graph/' file paths.
learning_rate = 0.02  # Learning rate passed to the Adam optimizer.
epochs  = 100  # Number of epochs to train.
hidden1 = 200  # Number of units in hidden layer 1.
dropout = 0.5  # Dropout rate (1 - keep probability).
weight_decay = 0.   # Weight of the L2 penalty on the first layer's trainable variables.
early_stopping = 10 # Tolerance for early stopping (# of epochs).

In [50]:
import time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import optimizers
from sklearn import metrics
import os
import numpy as np
import re
import sys
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
from tensorflow.keras import layers

In [51]:
# Seed NumPy's and TensorFlow's global random generators with one fixed value
# (intended to make weight initialisation and dropout repeatable between runs).
seed = 6606
np.random.seed(seed)
tf.random.set_seed(seed)
def masked_softmax_cross_entropy(preds, labels, mask):
    """
    Mean softmax cross-entropy over the entries selected by ``mask``.

    :param preds: logits passed to ``tf.nn.softmax_cross_entropy_with_logits``.
    :param labels: target distribution for each row of ``preds``.
    :param mask: per-row selector; cast to float32 before use.
    :return: scalar tensor, the mean of the mask-weighted per-row losses.
    """
    per_row_loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)

    # Dividing by the mask's mean makes the final reduce_mean over *all* rows
    # equal to the average over the selected rows only.
    weights = tf.cast(mask, dtype=tf.float32)
    weights = weights / tf.reduce_mean(weights)

    return tf.reduce_mean(per_row_loss * weights)


def masked_accuracy(preds, labels, mask):
    """
    Classification accuracy over the entries selected by ``mask``.

    :param preds: per-row class scores; the arg-max along axis 1 is the prediction.
    :param labels: per-row targets; the arg-max along axis 1 is the true class.
    :param mask: per-row selector; cast to float32 before use.
    :return: scalar tensor, the mean of the mask-weighted per-row hits.
    """
    predicted_class = tf.argmax(preds, 1)
    true_class = tf.argmax(labels, 1)
    hits = tf.cast(tf.equal(predicted_class, true_class), tf.float32)

    # Same re-weighting as the masked loss: mean over all rows == mean over
    # the selected rows.
    weights = tf.cast(mask, dtype=tf.float32)
    weights = weights / tf.reduce_mean(weights)

    return tf.reduce_mean(hits * weights)


In [52]:
def parse_index_file(filename):
    """
    Read a text file holding one integer per line.

    The file is opened in a ``with`` block so the handle is always closed,
    and lines that are empty after stripping (e.g. a trailing newline at the
    end of the file) are skipped instead of crashing ``int('')``.

    :param filename: path of the index file.
    :return: list of ints, in file order.
    :raises ValueError: if a non-blank line is not a valid integer.
    """
    with open(filename) as index_file:
        return [int(line.strip()) for line in index_file if line.strip()]


def sample_mask(idx, l):
    """
    Build a boolean mask of length ``l`` that is True at the positions in ``idx``.

    Uses the builtin ``bool`` as dtype: the ``np.bool`` alias was deprecated
    in NumPy 1.20 and removed in 1.24, so referencing it fails on those
    releases.

    :param idx: index or sequence of indices (list, range, array) to set.
    :param l: length of the mask.
    :return: 1-D ``numpy.ndarray`` of dtype bool.
    """
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask


def load_corpus(dataset_str):
    """
    Load the pickled graph inputs of a corpus and build the train/val/test split.

    All files are read from ``cleaned_data/<dataset_str>/graph/``:

    ind.dataset_str.x => the feature vectors of the training docs as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test docs as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training docs/words
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training docs as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test docs as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.adj => adjacency matrix of word/doc nodes as scipy.sparse.csr.csr_matrix object;
    dataset_str.train.index => text file with one integer per line (the indices of training docs
        in original doc list); only its line count is used here.

    All ``ind.*`` objects above must be saved using python pickle module.

    :param dataset_str: Dataset name
    :return: tuple ``(adj, features, y_train, y_val, y_test, train_mask, val_mask,
        test_mask, train_size, test_size)``. ``features`` is ``allx`` stacked on top
        of ``tx`` (LIL format); the ``y_*`` arrays have the shape of the stacked
        labels and are zero outside their mask; ``adj`` is symmetrized.
    """

    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'adj']
    objects = []
    for i in range(len(names)):
        with open('cleaned_data/' + dataset_str + '/graph/ind.' + dataset_str + '.' + names[i], 'rb') as f:
            # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, adj = tuple(objects)
    # print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)

    # Row order of the stacked matrices: every allx row first, then the test rows.
    features = sp.vstack((allx, tx)).tolil()
    labels = np.vstack((ally, ty))
    # print(len(labels))

    train_idx_orig = parse_index_file('cleaned_data/' + dataset_str + '/graph/' + dataset_str + '.train.index')
    train_size = len(train_idx_orig)

    # Whatever the index file lists beyond the rows of x is treated as validation data.
    val_size = train_size - x.shape[0]
    test_size = tx.shape[0]

    # presumably the validation rows directly follow the training rows inside
    # allx/ally -- TODO confirm against the script that built the graph files.
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + val_size)
    idx_test = range(allx.shape[0], allx.shape[0] + test_size)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    # Full-size label arrays that are zero everywhere except inside their own split.
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    # Symmetrize: wherever adj.T > adj the entry becomes adj.T, otherwise it
    # stays adj, i.e. the element-wise maximum of adj and adj.T.
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size


def sparse_to_tuple(sparse_mx):
    """
    Convert a sparse matrix (or a list of them) to ``(coords, values, shape)``.

    ``coords`` is an ``(nnz, 2)`` array of ``[row, col]`` pairs, ``values`` the
    matching data array and ``shape`` the matrix shape. When a list is passed,
    its elements are replaced in place and the same list object is returned.
    """
    def _as_triplet(matrix):
        # Work on the COO view, converting only when necessary.
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        index_pairs = np.vstack((coo.row, coo.col)).transpose()
        return index_pairs, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return _as_triplet(sparse_mx)

    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = _as_triplet(matrix)
    return sparse_mx


def preprocess_features(features):
    """
    Row-normalize feature matrix and convert to tuple representation.

    Every row is divided by its sum; rows that sum to zero are left as zeros.

    Row sums of an integer matrix are promoted to float64 first, because
    ``np.power(int_array, -1)`` raises ``ValueError`` ("Integers to negative
    integer powers are not allowed"). Floating inputs keep their dtype.

    :param features: scipy sparse matrix with one row per node.
    :return: ``(coords, values, shape)`` tuple as produced by ``sparse_to_tuple``.
    """
    rowsum = np.asarray(features.sum(1)).flatten()
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)

    # 1/0 -> inf is expected for empty rows and is zeroed right below, so the
    # divide-by-zero warning is suppressed locally.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0.

    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)

    return sparse_to_tuple(features)



def normalize_adj(adj):
    """
    Symmetric normalization of an adjacency matrix.

    With ``D`` the diagonal matrix of row sums, returns ``(A D^-1/2)^T D^-1/2``
    in COO format. Rows whose sum is zero get a scaling factor of 0.
    """
    coo = sp.coo_matrix(adj)
    degrees = np.array(coo.sum(1))

    inv_sqrt_degree = np.power(degrees, -0.5).flatten()
    inv_sqrt_degree[np.isinf(inv_sqrt_degree)] = 0.
    scaling = sp.diags(inv_sqrt_degree)

    column_scaled = coo.dot(scaling)
    return column_scaled.transpose().dot(scaling).tocoo()


def preprocess_adj(adj):
    """
    Prepare an adjacency matrix for the simple GCN model.

    Adds the identity (self-loops), applies ``normalize_adj`` and returns the
    result in the tuple representation of ``sparse_to_tuple``.
    """
    with_self_loops = adj + sp.eye(adj.shape[0])
    return sparse_to_tuple(normalize_adj(with_self_loops))



def construct_feed_dict(features, support, labels, labels_mask, placeholders):
    """
    Map placeholder handles to the values that should be fed to them.

    :param features: tuple-form features; ``features[1].shape`` is fed as the
        ``num_features_nonzero`` value.
    :param support: sequence of support values, paired by position with
        ``placeholders['support']``.
    :param labels: value for ``placeholders['labels']``.
    :param labels_mask: value for ``placeholders['labels_mask']``.
    :param placeholders: dict holding the placeholder handles.
    :return: dict ``{placeholder: value}``.
    """
    feed_dict = {
        placeholders['labels']: labels,
        placeholders['labels_mask']: labels_mask,
        placeholders['features']: features,
    }
    for position, support_value in enumerate(support):
        feed_dict[placeholders['support'][position]] = support_value
    feed_dict[placeholders['num_features_nonzero']] = features[1].shape
    return feed_dict


def loadWord2Vec(filename):
    """
    Read word vectors from a space-separated text file.

    Each usable line is ``<word> <v1> <v2> ...``. Lines with fewer than three
    fields (for example a ``<count> <dim>`` header) are skipped.

    The file is read inside a ``with`` block so the handle is closed even when
    a value cannot be parsed, and it is streamed line by line instead of being
    loaded whole with ``readlines()``.

    :param filename: path of the vector file.
    :return: ``(vocab, embd, word_vector_map)`` -- the words in file order, the
        list of float vectors in the same order, and a dict from word to vector
        (sharing the list objects stored in ``embd``).
    :raises ValueError: if a vector component is not a valid float.
    """
    vocab = []
    embd = []
    word_vector_map = {}
    with open(filename, 'r') as vector_file:
        for line in vector_file:
            row = line.strip().split(' ')
            if len(row) > 2:
                word = row[0]
                vector = [float(value) for value in row[1:]]
                vocab.append(word)
                embd.append(vector)
                word_vector_map[word] = vector

    print('Loaded Word Vectors!')
    return vocab, embd, word_vector_map


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Applies an ordered list of regex substitutions, then strips and
    lower-cases the result.

    The replacements for ``(``, ``)`` and ``?`` are raw strings: the former
    non-raw literals contained the invalid escape sequences ``\\(``, ``\\)``
    and ``\\?``, which emit a SyntaxWarning on Python 3.12+. The string values
    are unchanged, so the output is identical. Note that ``re.sub`` leaves
    these unknown escapes alone, so the output really contains a backslash in
    front of ``(``, ``)`` and ``?``.

    :param string: raw text.
    :return: cleaned, lower-cased text with tokens separated by single spaces.
    """
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),  # drop every other character
        (r"\'s", " 's"),                  # split off common clitics
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),                    # isolate punctuation
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),                 # collapse runs of whitespace
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()

def uniform(shape, scale=0.05, name=None):
    """Create a float64 variable drawn uniformly from ``[-scale, scale)``."""
    return tf.Variable(
        tf.random.uniform(shape, minval=-scale, maxval=scale, dtype=tf.float64),
        name=name,
    )


def glorot(shape, name=None):
    """
    Glorot & Bengio (AISTATS 2010) init.

    Returns a float64 variable drawn uniformly from ``[-r, r)`` with
    ``r = sqrt(6 / (shape[0] + shape[1]))``.
    """
    fan_sum = shape[0] + shape[1]
    limit = np.sqrt(6.0 / fan_sum)
    return tf.Variable(
        tf.random.uniform(shape, minval=-limit, maxval=limit, dtype=tf.float64),
        name=name,
    )


def zeros(shape, name=None):
    """Create a float64 variable of the given shape filled with zeros."""
    return tf.Variable(tf.zeros(shape, dtype=tf.float64), name=name)


def ones(shape, name=None):
    """Create a float64 variable of the given shape filled with ones."""
    return tf.Variable(tf.ones(shape, dtype=tf.float64), name=name)


# Module-level registry for layer name assignment:
# layer name -> highest ID handed out for that name so far.
_LAYER_UIDS = {}


def get_layer_uid(layer_name=''):
    """Return the next unique ID for ``layer_name`` (1 on first use, then 2, 3, ...)."""
    next_uid = _LAYER_UIDS.get(layer_name, 0) + 1
    _LAYER_UIDS[layer_name] = next_uid
    return next_uid


def sparse_dropout(x, rate, noise_shape):
    """
    Dropout for sparse tensors.

    ``rate`` is the probability of DROPPING a stored value, matching the
    meaning of ``rate`` in ``tf.nn.dropout``. The previous implementation used
    ``rate`` as the keep probability: it kept values with probability ``rate``
    and rescaled by ``1 / rate``. That is only correct for ``rate == 0.5`` and
    divides by zero for ``rate == 0``.

    :param x: ``tf.SparseTensor`` whose stored values are subject to dropout.
    :param rate: drop probability; a Python number must lie in ``[0, 1)``.
    :param noise_shape: shape of the random mask handed to ``tf.sparse.retain``
        (the caller passes the shape of the stored-values array).
    :return: sparse tensor with the surviving values scaled by ``1 / (1 - rate)``.
    :raises ValueError: if ``rate`` is a Python number outside ``[0, 1)``.
    """
    if isinstance(rate, (int, float)) and not 0 <= rate < 1:
        raise ValueError("rate must be in [0, 1), got {}".format(rate))

    keep_prob = 1. - rate
    # floor(keep_prob + U[0, 1)) equals 1 with probability keep_prob, else 0.
    random_tensor = keep_prob + tf.random.uniform(noise_shape)
    dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool)
    pre_out = tf.sparse.retain(x, dropout_mask)
    # Rescale the survivors so the expected value is unchanged.
    return pre_out * (1. / keep_prob)


def dot(x, y, sparse=False):
    """
    Matrix product of ``x`` and ``y``.

    Uses ``tf.sparse.sparse_dense_matmul`` when ``sparse`` is truthy and
    ``tf.matmul`` otherwise.
    """
    if not sparse:
        return tf.matmul(x, y)
    return tf.sparse.sparse_dense_matmul(x, y)


In [53]:
# Load the adjacency matrix, label arrays, split masks and split sizes of the configured corpus.
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size = load_corpus(dataset)

# The loaded feature vectors are discarded: every node gets a one-hot identity
# row instead, which is then row-normalized and converted to tuple form.
features = sp.identity(features.shape[0])  # featureless
features = preprocess_features(features)

In [54]:
# Single support matrix: the normalized adjacency (with self-loops) in tuple form.
support = [preprocess_adj(adj)]

# Wrap the NumPy/SciPy inputs as TensorFlow tensors for the training loop.
# `features` is a (coords, values, shape) tuple, so it unpacks straight into SparseTensor.
t_features = tf.SparseTensor(*features)
t_y_train = tf.convert_to_tensor(y_train)
t_y_val = tf.convert_to_tensor(y_val)
t_y_test = tf.convert_to_tensor(y_test)
tm_train_mask = tf.convert_to_tensor(train_mask)

tm_val_mask = tf.convert_to_tensor(val_mask)
tm_test_mask = tf.convert_to_tensor(test_mask)

# One SparseTensor per support matrix, cast to float64.
t_support = []
for i in range(len(support)):
    t_support.append(tf.cast(tf.SparseTensor(*support[i]), dtype=tf.float64))

In [55]:
class GraphConvolution(layers.Layer):
    """
    Graph convolution layer.

    Computes ``activation(sum_i support_i @ (x @ W_i) [+ b])``. For a
    ``featureless`` layer the product ``x @ W_i`` is replaced by ``W_i``.

    Only one weight matrix is created, so ``call`` expects a support list of
    length one; a longer list fails when the missing weight is indexed.

    :param input_dim: number of input features (rows of the weight matrix).
    :param output_dim: number of output features (columns of the weight matrix).
    :param num_features_nonzero: noise shape handed to ``sparse_dropout`` for
        sparse inputs.
    :param dropout: dropout rate applied to the layer input.
    :param is_sparse_inputs: whether ``x`` is a sparse tensor.
    :param activation: callable applied to the layer output.
    :param bias: whether to add a learned bias vector.
    :param featureless: whether to ignore ``x`` and use the weights directly.
    """
    def __init__(self, input_dim, output_dim, num_features_nonzero,
                 dropout=0.,
                 is_sparse_inputs=False,
                 activation=tf.nn.relu,
                 bias=False,
                 featureless=False, **kwargs):
        super(GraphConvolution, self).__init__(**kwargs)

        self.dropout = dropout
        self.activation = activation
        self.is_sparse_inputs = is_sparse_inputs
        self.featureless = featureless
        # Keep the on/off switch separate from the variable: `self.bias` is
        # later replaced by the bias vector, and the truth value of a vector
        # is ambiguous, so `if self.bias:` cannot be used in `call`.
        self.use_bias = bool(bias)
        self.bias = bias
        self.num_features_nonzero = num_features_nonzero
        self.embedding = None

        # `add_weight` replaces the deprecated `add_variable` alias; name and
        # shape are passed by keyword, everything else keeps its default.
        self.weights_ = [
            self.add_weight(name='weight0', shape=[input_dim, output_dim])
        ]
        if self.use_bias:
            self.bias = self.add_weight(name='bias', shape=[output_dim])

    def call(self, inputs, training=None):
        """
        Apply the layer.

        :param inputs: pair ``(x, support_)`` -- node features and the list of
            sparse support matrices.
        :param training: dropout is skipped only when this is exactly ``False``;
            ``None`` and ``True`` both apply dropout.
        :return: activated output; the pre-activation output is also stored in
            ``self.embedding``.
        """
        x, support_ = inputs

        # dropout
        if training is not False:
            if self.is_sparse_inputs:
                x = sparse_dropout(x, self.dropout, self.num_features_nonzero)
            else:
                x = tf.nn.dropout(x, self.dropout)

        # convolve
        supports = list()
        for i in range(len(support_)):
            if not self.featureless:  # if it has features x
                pre_sup = dot(x, self.weights_[i], sparse=self.is_sparse_inputs)
            else:
                pre_sup = self.weights_[i]

            support = dot(support_[i], pre_sup, sparse=True)
            supports.append(support)

        output = tf.add_n(supports)

        # bias
        if self.use_bias:
            output += self.bias

        self.embedding = output  # for visualization
        return self.activation(output)


In [56]:
class GCN(keras.Model):
    """
    Two-layer graph convolutional network.

    The first ``GraphConvolution`` maps ``input_dim`` features to ``hidden1``
    units with ReLU on sparse inputs; the second maps ``hidden1`` units to
    ``output_dim`` logits with an identity activation. ``hidden1``, ``dropout``
    and ``weight_decay`` are read from the notebook-level configuration.

    :param input_dim: number of input features per node.
    :param output_dim: number of classes.
    :param num_features_nonzero: forwarded to both ``GraphConvolution`` layers.
    """

    def __init__(self, input_dim, output_dim, num_features_nonzero, **kwargs):
        super(GCN, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim

        print('input  dim: ', input_dim)
        print('output dim: ', output_dim)

        # Logits of the most recent forward pass, consumed by `predict`.
        self.last_logits = None

        self.layers_ = []
        self.layers_.append(GraphConvolution(input_dim=self.input_dim,
                                            output_dim=hidden1,
                                            num_features_nonzero=num_features_nonzero,
                                            activation=tf.nn.relu,
                                            dropout=dropout,
                                            is_sparse_inputs=True))

        self.layers_.append(GraphConvolution(input_dim=hidden1,
                                            output_dim=self.output_dim,
                                            num_features_nonzero=num_features_nonzero,
                                            activation=lambda x: x,
                                            dropout=dropout))

    def call(self, inputs, training=None):
        """
        Run a forward pass and compute the masked loss and accuracy.

        :param inputs: tuple ``(x, label, mask, support)``.
        :param training: forwarded unchanged to every layer.
        :return: ``(predicted_class, loss, accuracy)`` where ``predicted_class``
            is the arg-max of the logits along axis 1.
        """
        x, label, mask, support = inputs

        outputs = [x]

        # Iterate the explicit layer list, the same one the weight-decay term
        # below reads, rather than relying on Keras' automatic attribute
        # tracking behind `self.layers`.
        for layer in self.layers_:
            hidden = layer((outputs[-1], support), training)
            outputs.append(hidden)
        output = outputs[-1]
        self.last_logits = output

        # Weight decay loss (first layer only)
        loss = tf.zeros([])
        for var in self.layers_[0].trainable_variables:
            loss += weight_decay * tf.nn.l2_loss(var)

        # Cross entropy error
        loss += masked_softmax_cross_entropy(output, label, mask)

        acc = masked_accuracy(output, label, mask)

        return tf.argmax(output, 1), loss, acc

    def predict(self):
        """
        Class probabilities (softmax of the logits) of the latest forward pass.

        The previous body read ``self.outputs``, which this class never
        assigns, so it could not return the model's predictions. The logits
        stored by ``call`` are used instead.

        NOTE(review): this shadows ``keras.Model.predict`` with a different
        signature; the zero-argument form is kept for existing callers.

        :raises RuntimeError: if the model has not been called yet.
        """
        if self.last_logits is None:
            raise RuntimeError("predict() requires the model to be called at least once")
        return tf.nn.softmax(self.last_logits)


In [57]:
# Create model
# `features` is a (coords, values, shape) tuple: features[2][1] is the number of
# feature columns and features[1].shape the shape of the stored-values array.
model = GCN(input_dim=features[2][1], output_dim=y_train.shape[1], num_features_nonzero=features[1].shape)



# Loss and optimizer
optimizer = optimizers.Adam(learning_rate=learning_rate)

# Validation loss of every epoch, consulted by the early-stopping check.
cost_val = []


input  dim:  15362
output dim:  8


In [58]:
# Full-batch training: one forward/backward pass over the whole graph per epoch.
for epoch in range(epochs):

    t = time.time()
    # NOTE(review): `training` is not passed here, so the layers receive None.
    # GraphConvolution applies dropout whenever `training is not False`, so
    # dropout is active in this pass; passing training=True would make that explicit.
    with tf.GradientTape() as tape:
        _, loss, acc = model((t_features, t_y_train, tm_train_mask, t_support))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Validation pass on the same graph with dropout disabled.
    _, val_loss, val_acc = model((t_features, t_y_val, tm_val_mask, t_support), training=False)
    cost_val.append(val_loss)

    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(loss),
          "train_acc=", "{:.5f}".format(acc), "val_loss=", "{:.5f}".format(val_loss),
          "val_acc=", "{:.5f}".format(val_acc), "time=", "{:.5f}".format(time.time() - t))

    # Early stopping: skipped for the first `early_stopping` + 1 epochs, then
    # stop as soon as the latest validation loss exceeds the mean of the
    # `early_stopping` validation losses that precede it.
    # NOTE(review): `cost_val` is created in an earlier cell, so re-running only
    # this cell keeps appending to the old history.
    if epoch > early_stopping and cost_val[-1] > np.mean(cost_val[-(early_stopping+1):-1]):
        print("Early stopping...")
        break

Epoch: 0001 train_loss= 2.07943 train_acc= 0.08446 val_loss= 2.06114 val_acc= 0.65328 time= 5.23540
Epoch: 0002 train_loss= 2.06178 train_acc= 0.65445 val_loss= 2.02133 val_acc= 0.70073 time= 5.06043
Epoch: 0003 train_loss= 2.02285 train_acc= 0.66377 val_loss= 1.95837 val_acc= 0.72080 time= 4.92645
Epoch: 0004 train_loss= 1.96015 train_acc= 0.70164 val_loss= 1.87269 val_acc= 0.73723 time= 5.21167
Epoch: 0005 train_loss= 1.87801 train_acc= 0.70144 val_loss= 1.76673 val_acc= 0.74453 time= 4.96838
Epoch: 0006 train_loss= 1.77486 train_acc= 0.71926 val_loss= 1.64652 val_acc= 0.74453 time= 5.27725
Epoch: 0007 train_loss= 1.66752 train_acc= 0.71562 val_loss= 1.52308 val_acc= 0.73905 time= 5.01324
Epoch: 0008 train_loss= 1.54614 train_acc= 0.70630 val_loss= 1.40879 val_acc= 0.73723 time= 5.09187
Epoch: 0009 train_loss= 1.43603 train_acc= 0.71582 val_loss= 1.31238 val_acc= 0.72080 time= 5.02987
Epoch: 0010 train_loss= 1.35458 train_acc= 0.69354 val_loss= 1.23460 val_acc= 0.68066 time= 5.32537


Epoch: 0083 train_loss= 0.08204 train_acc= 0.97914 val_loss= 0.12249 val_acc= 0.96898 time= 5.29085
Epoch: 0084 train_loss= 0.08427 train_acc= 0.97792 val_loss= 0.12214 val_acc= 0.96898 time= 5.04601
Epoch: 0085 train_loss= 0.08151 train_acc= 0.98096 val_loss= 0.12120 val_acc= 0.97080 time= 5.20162
Epoch: 0086 train_loss= 0.07606 train_acc= 0.98096 val_loss= 0.11979 val_acc= 0.97080 time= 4.71493
Epoch: 0087 train_loss= 0.07873 train_acc= 0.97995 val_loss= 0.11808 val_acc= 0.97080 time= 4.83650
Epoch: 0088 train_loss= 0.07633 train_acc= 0.98319 val_loss= 0.11657 val_acc= 0.97263 time= 4.91131
Epoch: 0089 train_loss= 0.07479 train_acc= 0.98076 val_loss= 0.11627 val_acc= 0.97263 time= 4.96424
Epoch: 0090 train_loss= 0.07150 train_acc= 0.98197 val_loss= 0.11465 val_acc= 0.97080 time= 5.24305
Epoch: 0091 train_loss= 0.06709 train_acc= 0.98298 val_loss= 0.11282 val_acc= 0.96898 time= 5.26377
Epoch: 0092 train_loss= 0.06470 train_acc= 0.98440 val_loss= 0.11152 val_acc= 0.96533 time= 4.84557


In [59]:
def evaluate(features, y, mask, support):
    """
    Run the global ``model`` in inference mode on one masked split.

    :param features: node features passed to the model.
    :param y: label array; its arg-max along axis 1 is returned as the true classes.
    :param mask: selector of the rows that count towards loss and accuracy.
    :param support: list of support matrices passed to the model.
    :return: ``(loss, accuracy, predicted_classes, true_classes, seconds_elapsed)``.
    """
    started = time.time()

    outputs = model((features, y, mask, support), training=False)
    pred, test_loss, test_acc = outputs

    true_classes = np.argmax(y, axis=1)
    elapsed = time.time() - started
    return test_loss, test_acc, pred, true_classes, elapsed


# Evaluate on the test split and report loss, accuracy and wall-clock time.
test_cost, test_acc, pred, labels, test_duration = evaluate(t_features, t_y_test, tm_test_mask, t_support)
print("Test set results:", "cost=", "{:.5f}".format(test_cost), "accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration))


# Keep only the test nodes. Boolean-mask indexing replaces the former Python
# loop, which indexed the prediction tensor once per node in the graph.
test_pred = np.asarray(pred)[test_mask]
test_labels = np.asarray(labels)[test_mask]

print("Average Test Precision, Recall and F1-Score...")
print(metrics.precision_recall_fscore_support(test_labels, test_pred, average='micro'))


Test set results: cost= 0.11332 accuracy= 0.97259 time= 1.04790
Average Test Precision, Recall and F1-Score...
(0.9725902238465053, 0.9725902238465053, 0.9725902238465053, None)
