In [1]:
from __future__ import division
from __future__ import print_function

import time
import os

# Train on CPU (hide GPU) due to memory constraints
os.environ['CUDA_VISIBLE_DEVICES'] = "-1"

import tensorflow as tf
import numpy as np
import scipy.sparse as sp

from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

from optimizer import OptimizerAE, OptimizerVAE
from input_data import load_data
from model import GCNModelAE, GCNModelVAE
from preprocessing import preprocess_graph, construct_feed_dict, sparse_to_tuple, mask_test_edges

def del_all_flags(FLAGS):
    """Delete from ``FLAGS`` every name yielded by ``FLAGS._flags()``.

    The names are copied into a list before anything is deleted, so the
    deletions cannot disturb the iteration over the flag container.
    """
    registered_names = list(FLAGS._flags())
    for flag_name in registered_names:
        FLAGS.__delattr__(flag_name)

# Drop every flag already registered on the global FLAGS object before the
# DEFINE_* calls below run (presumably so this cell can be re-executed in the
# same kernel without duplicate-flag errors — confirm).
del_all_flags(tf.flags.FLAGS)


# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
# NOTE(review): the 'f' flag looks like a placeholder for the "-f <file>"
# argument a Jupyter kernel is started with — confirm.
flags.DEFINE_string('f', '', 'kernel')
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 2000, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 64, 'Number of units in hidden layer 1.')
flags.DEFINE_integer('hidden2', 16, 'Number of units in hidden layer 2.')
flags.DEFINE_float('weight_decay', 0., 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_float('dropout', 0., 'Dropout rate (1 - keep probability).')

flags.DEFINE_string('model', 'gcn_ae', 'Model string.')
flags.DEFINE_string('dataset', 'yale', 'Dataset string.')
flags.DEFINE_integer('features', 0, 'Whether to use features (1) or not (0).')

model_str = FLAGS.model
#dataset_str = FLAGS.dataset

# The dataset name is hard-coded here; the 'dataset' flag defined above is not
# read anywhere in this cell.
dataset_str = 'ny'
# Embedding size; reused later when building the embedding file names.
dim = FLAGS.hidden2

# Load data
adj, features = load_data(dataset_str)

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

# Split the graph into a training adjacency plus positive/negative edge sets
# for validation and testing (semantics defined by preprocessing.mask_test_edges).
adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Row count of the adjacency matrix; the same value is used as num_nodes below.
adj.shape[0]

44371

In [3]:
# np.save('./data/roch_train_edges.npy',train_edges)
# np.save('./data/roch_val_edges.npy',val_edges)
# np.save('./data/roch_val_edges_false.npy',val_edges_false)
# np.save('./data/roch_test_edges.npy',test_edges)
# np.save('./data/roch_test_edges_false.npy',test_edges_false)

In [4]:
#adj = adj_train
# NOTE(review): this replaces the adj_train returned by mask_test_edges with
# the full adjacency matrix, so both adj_norm and adj_label below are built
# from the full graph — presumably including the validation/test edges.
# Confirm this is intended (e.g. to embed the complete graph); if so, the
# val/test scores printed during training are not held-out estimates.
adj_train = adj
if FLAGS.features == 0:
    features = sp.identity(adj.shape[0])  # featureless

# Some preprocessing
adj_norm = preprocess_graph(adj)

# Define placeholders
placeholders = {
    'features': tf.sparse_placeholder(tf.float32),
    'adj': tf.sparse_placeholder(tf.float32),
    'adj_orig': tf.sparse_placeholder(tf.float32),
    'dropout': tf.placeholder_with_default(0., shape=())
}

num_nodes = adj.shape[0]

# Assumes sparse_to_tuple returns (coords, values, shape), as the local
# re-definition further down in this notebook does — TODO confirm for the
# version imported from preprocessing.
features = sparse_to_tuple(features.tocoo())
num_features = features[2][1]
features_nonzero = features[1].shape[0]

# Create model
# model stays None (and opt stays undefined) for any other model_str.
model = None
if model_str == 'gcn_ae':
    model = GCNModelAE(placeholders, num_features, features_nonzero)
elif model_str == 'gcn_vae':
    model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero)

# With N = adj.shape[0] and S = adj.sum():
#   pos_weight = (N*N - S) / S
#   norm       = N*N / (2 * (N*N - S))
# For a 0/1 adjacency matrix S is the number of non-zero entries, making
# pos_weight the ratio of zero to non-zero entries (assumes binary adj — confirm).
pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

# Optimizer
# The labels are the 'adj_orig' placeholder densified and flattened to 1-D.
with tf.name_scope('optimizer'):
    if model_str == 'gcn_ae':
        opt = OptimizerAE(preds=model.reconstructions,
                          labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                      validate_indices=False), [-1]),
                          model=model,
                          pos_weight=pos_weight,
                          norm=norm)
    elif model_str == 'gcn_vae':
        opt = OptimizerVAE(preds=model.reconstructions,
                           labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                       validate_indices=False), [-1]),
                           model=model, num_nodes=num_nodes,
                           pos_weight=pos_weight,
                           norm=norm)

# Initialize session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Not appended to anywhere in this notebook; re-initialised again before the
# training loop.
cost_val = []
acc_val = []


def get_roc_score(edges_pos, edges_neg, emb=None):
    """Score link prediction on a set of true edges and a set of non-edges.

    Every edge ``(i, j)`` is scored as ``sigmoid(emb[i] . emb[j])``. Positive
    edges are labelled 1 and negative edges 0, and the scores are summarised
    with ROC AUC and average precision.

    Only the dot products of the requested edges are computed. The full
    ``num_nodes x num_nodes`` reconstruction ``np.dot(emb, emb.T)`` is never
    materialised, so memory and time scale with the number of edges scored
    instead of with the square of the number of nodes. The scores equal the
    corresponding entries of that matrix up to floating-point rounding.

    Args:
        edges_pos: Sequence or 2-D array of node-index pairs that are edges;
            columns 0 and 1 are used.
        edges_neg: Same layout, pairs that are not edges.
        emb: Optional 2-D array with one embedding row per node. When None,
            ``model.z_mean`` is evaluated in the module-level session.

    Returns:
        Tuple ``(roc_score, ap_score)``.
    """
    if emb is None:
        # Deliberately updates the module-level feed_dict in place (dropout
        # set to 0); code that reuses feed_dict afterwards sees this value.
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def edge_scores(edges):
        """Return sigmoid(emb[i] . emb[j]) for every pair (i, j) in edges."""
        edges = np.asarray(edges)
        if edges.size == 0:
            return np.zeros(0)
        # Row-wise dot product of the two endpoint embeddings of each edge.
        logits = np.einsum('ij,ij->i', emb[edges[:, 0]], emb[edges[:, 1]])
        return sigmoid(logits)

    # Predict on test set of edges
    preds = edge_scores(edges_pos)
    preds_neg = edge_scores(edges_neg)

    preds_all = np.hstack([preds, preds_neg])
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)

    return roc_score, ap_score


# cost_val and acc_val are reset here but never appended to in this cell;
# only val_roc_score is filled by the loop.
cost_val = []
acc_val = []
val_roc_score = []

# Reconstruction target: adj_train with ones added on the diagonal, converted
# with sparse_to_tuple for feeding into the 'adj_orig' placeholder.
adj_label = adj_train + sp.eye(adj_train.shape[0])
adj_label = sparse_to_tuple(adj_label)

# Train model
for epoch in range(FLAGS.epochs):

    t = time.time()
    # Construct feed dictionary
    # Rebuilt every epoch; get_roc_score below sets the dropout entry of this
    # dict to 0, and the next iteration starts again from FLAGS.dropout.
    feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})
    # Run single weight update
    # outs[3] (opt.z) is fetched but not used inside the loop.
    outs = sess.run([opt.opt_op, opt.cost, opt.accuracy,opt.z], feed_dict=feed_dict)

    # Compute average loss
    avg_cost = outs[1]
    avg_accuracy = outs[2]
    # epoch%1 is always 0, so validation and logging run on every epoch.
    if epoch%1 == 0:
        roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false)
        val_roc_score.append(roc_curr)

        # The reported time covers the training step and the validation scoring.
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
              "train_acc=", "{:.5f}".format(avg_accuracy), "val_roc=", "{:.5f}".format(val_roc_score[-1]),
              "val_ap=", "{:.5f}".format(ap_curr),
              "time=", "{:.5f}".format(time.time() - t))

print("Optimization Finished!")

roc_score, ap_score = get_roc_score(test_edges, test_edges_false)
print('Test ROC score: ' + str(roc_score))
print('Test AP score: ' + str(ap_score))

# Overwrites outs with the final embeddings opt.z. feed_dict is the one from
# the last epoch, whose dropout entry get_roc_score has set to 0.
outs = sess.run(opt.z, feed_dict=feed_dict)

Epoch: 0001 train_loss= 0.39732 train_acc= 0.03282 val_roc= 0.70377 val_ap= 0.66604 time= 42.24061
Epoch: 0002 train_loss= 0.39732 train_acc= 0.01811 val_roc= 0.75387 val_ap= 0.72467 time= 38.65318
Epoch: 0003 train_loss= 0.39732 train_acc= 0.00910 val_roc= 0.78698 val_ap= 0.78191 time= 40.89233
Epoch: 0004 train_loss= 0.39731 train_acc= 0.00524 val_roc= 0.79541 val_ap= 0.80543 time= 40.98866
Epoch: 0005 train_loss= 0.39718 train_acc= 0.00402 val_roc= 0.78881 val_ap= 0.80647 time= 40.18979
Epoch: 0006 train_loss= 0.39697 train_acc= 0.00411 val_roc= 0.78076 val_ap= 0.80457 time= 38.04920


KeyboardInterrupt: 

In [None]:
# Evaluate the embeddings opt.z with the current feed_dict and store them as
# <dataset_str>_d<dim>_emb.npy.
# NOTE(review): the file is written under the relative directory '../../data/',
# while a later cell loads a file of the same name from an absolute path —
# confirm both resolve to the same directory.
outs = sess.run(opt.z, feed_dict=feed_dict)
np.save('../../data/'+dataset_str+'_d'+str(dim)+'_emb.npy',outs)

In [None]:
import pickle as pkl
import scipy.sparse as sp
import tensorflow as tf
import numpy as np
#dataname = 'yale_add_rm_40_re_2111'
# Load the pickled adjacency object for dataset_str; this rebinds the global
# name adj used by the earlier cells.
# NOTE(review): hard-coded absolute path. pickle.load can execute arbitrary
# code, so it should only be used on files from a trusted source.
with open('/home/SharedStorage2/yhuang24/KaiyangLi/my_node_match/data/'+dataset_str+'_adj.pkl', 'rb') as f1:
#with open('/home/SharedStorage2/yhuang24/KaiyangLi/my_node_match/data/yale_add_rm_40_adj.pkl', 'rb') as f1:
    adj = pkl.load(f1)

def sparse_to_tuple(sparse_mx):
    """Break a scipy sparse matrix into ``(coords, values, shape)``.

    Returns:
        coords: integer array with one ``[row, col]`` pair per stored entry.
        values: the stored entries, in the same order as ``coords``.
        shape: the matrix shape.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    index_pairs = np.column_stack((coo.row, coo.col))
    return index_pairs, coo.data, coo.shape
# Write the (row, col) index pair of every stored entry of adj to
# <dataset_str>_edges.txt as integers.
coords, values, shape = sparse_to_tuple(adj)
np.savetxt('/home/SharedStorage2/yhuang24/KaiyangLi/my_node_match/data/'+dataset_str+'_edges.txt',coords,fmt='%d')

In [None]:
import numpy as np
import pickle as pkl
import scipy.sparse as sp
def sort_by_value(d):
    """Return the keys of ``d`` ordered by their values, largest value first.

    Entries are compared as (value, key) pairs, so keys that share a value
    come out in descending key order.
    """
    ranked = sorted(((value, key) for key, value in d.items()), reverse=True)
    return [key for _, key in ranked]
#dataset = "amherst_addDel_new"
#dataset = "yale_add_rm_40_re_2111"
#dataset = "yale_add_rm_40"
#dim = "d16"
#edges_positive2 = np.loadtxt('/data/lky/MUSE/data/'+dataset+'_edges.txt')
import collections
import numpy as np

# All inputs and outputs of this cell share this path prefix.
# NOTE(review): hard-coded absolute path — consider a configurable data directory.
data_prefix = '/home/SharedStorage2/yhuang24/KaiyangLi/my_node_match/data/' + dataset_str
emb_prefix = data_prefix + '_d' + str(dim) + '_emb'

# Rank node ids by how often they occur in the edge file, most frequent first
# (ties are broken by sort_by_value). loadtxt yields floats, hence the int() cast.
edges_positive2 = np.loadtxt(data_prefix + '_edges.txt')
edges_positive2 = edges_positive2.flatten()
roch_dict_cnt = collections.Counter(edges_positive2)
roch_dict_cnt = sort_by_value(roch_dict_cnt)
roch_dict_cnt = [int(x) for x in roch_dict_cnt]

src_np = np.load(emb_prefix + '.npy')
size_src = src_np.shape
print(size_src)

# Node ids that never appear in the edge file are appended after the ranked
# ones; concatenating an empty id_remain is a no-op, so no special case is needed.
all_id = list(range(size_src[0]))
id_remain = list(set(all_id).difference(set(roch_dict_cnt)))
row_order = roch_dict_cnt + id_remain
src_np = src_np[row_order]
id_src = np.array(row_order).reshape((size_src[0], 1))

# First column is the node id, the remaining columns are its embedding.
src_np = np.hstack((id_src, src_np))

# Write the "<num_rows> <embedding_dim>" header and the rows in a single pass.
# comments='' stops savetxt from prefixing the header line with '# '.
np.savetxt(emb_prefix + '.vec', src_np, fmt='%.8f',
           header=str(size_src[0]) + ' ' + str(size_src[1]), comments='')


In [None]:
# Display the dataset name used for the file names in the cells above.
dataset_str