In [None]:
## Training the GRN 
from train import *
from utils import *
from data import *

# Build the run configuration and expose only the configured GPU index to CUDA.
args = Args()
os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(args.cuda)
print('CUDA', args.cuda)

def test_graph(epoch, args, rnn, output, graphs_test, test_batch_size):
    """Sample graphs conditioned on every test graph and save them to disk.

    For each graph in ``graphs_test`` the node-level model ``rnn`` is unrolled
    for as many steps as that graph has nodes. At every step the edge-level
    model ``output`` samples the entries of one row, which becomes the next
    input of ``rnn``. The first input row has a 1 in column 0 and the first
    node's ``'feature'`` attribute in the columns after ``args.max_prev_node``.

    Args:
        epoch: Epoch number, used only to build the output file name.
        args: Run configuration; reads ``max_num_node``, ``max_prev_node``,
            ``num_layers``, ``test_single_batch_size``, ``graph_save_path``
            and ``fname_pred``.
        rnn: Node-level recurrent model exposing ``init_hidden`` and ``hidden``.
        output: Edge-level recurrent model exposing ``hidden``.
        graphs_test: Sequence of graphs whose nodes carry a ``'feature'``
            attribute.
        test_batch_size: Number of samples generated in parallel per test
            graph.

    Side effects:
        Switches both models to eval mode, prints progress information and
        writes the list of generated graphs with ``save_graph_list``.
    """
    number_of_subgraphs = len(graphs_test)
    print(number_of_subgraphs)

    # Inference mode does not change between test graphs, so set it once.
    rnn.eval()
    output.eval()

    G_pred_final = []
    for graph in graphs_test:
        # The hidden state must have the same batch dimension as the inputs
        # built below, i.e. the test batch size (not the training batch size).
        rnn.hidden = rnn.init_hidden(test_batch_size)

        num_of_nodes = len(graph.nodes())
        atts = nx.get_node_attributes(graph, 'feature')
        atts_array = list(atts.values())
        # Width of one row: adjacency window followed by the node features.
        step_width = args.max_prev_node + len(atts_array[0])

        y_pred_long = Variable(torch.zeros(test_batch_size, args.max_num_node, step_width)).cuda()  # discrete prediction

        # Start row: a 1 in column 0, then the first node's feature vector.
        x_step = np.zeros((test_batch_size, 1, step_width))
        x_step[:, :, 0] = 1
        x_step[:, :, args.max_prev_node:] = atts_array[0]
        x_step = Variable(torch.from_numpy(x_step).float()).cuda()

        for i in range(num_of_nodes):
            h = rnn(x_step)
            # Only the first layer of the edge model is seeded with h; the
            # remaining layers start from zeros.
            hidden_null = Variable(torch.zeros(args.num_layers - 1, h.size(0), h.size(2))).cuda()
            output.hidden = torch.cat((h.permute(1, 0, 2), hidden_null),
                                      dim=0)  # num_layers, batch_size, hidden_size
            x_step = Variable(torch.zeros(test_batch_size, 1, step_width)).cuda()
            output_x_step = Variable(torch.ones(test_batch_size, 1, 1)).cuda()
            for j in range(min(step_width, i + 1)):
                output_y_pred_step = output(output_x_step)
                output_x_step = sample_sigmoid(output_y_pred_step, sample=True, sample_time=1)
                x_step[:, :, j:j + 1] = output_x_step
                # Re-wrap the hidden state so no graph history is carried over.
                output.hidden = Variable(output.hidden.data).cuda()
            y_pred_long[:, i:i + 1, :] = x_step
            rnn.hidden = Variable(rnn.hidden.data).cuda()
        y_pred_long_data = y_pred_long.data.long()

        # Decode the first ``test_single_batch_size`` samples of the batch.
        # NOTE(review): this assumes test_single_batch_size <= test_batch_size.
        for i in range(args.test_single_batch_size):
            adj_pred = decode_adj(y_pred_long_data[i].cpu().numpy())
            G_pred = get_graph(adj_pred)  # get a graph from zero-padded adj
            G_pred_final.append(G_pred)

    print(len(G_pred_final))
    # save graphs
    fname = args.graph_save_path + args.fname_pred + str(epoch) + '.dat'
    print(fname)
    save_graph_list(G_pred_final, fname)

# Load all graphs, shuffle them reproducibly and carve out the splits.
graphs = create_graphs.create(args)
random.seed(123)
shuffle(graphs)
graphs_len = len(graphs)
train_cutoff = int(0.8 * graphs_len)
graphs_train = graphs[:train_cutoff]
graphs_test = graphs[train_cutoff:]
# NOTE(review): the validation slice is taken from the start of the list and
# therefore overlaps the training slice.
graphs_validate = graphs[:int(0.2 * graphs_len)]

# Dataset-wide size statistics.
args.max_num_node = max(g.number_of_nodes() for g in graphs)
edge_counts = [g.number_of_edges() for g in graphs]
max_num_edge = max(edge_counts)
min_num_edge = min(edge_counts)

# The full (unsplit) list is stored under both the train and the test name.
save_graph_list(graphs, args.graph_save_path + args.fname_train + '0.dat')
save_graph_list(graphs, args.graph_save_path + args.fname_test + '0.dat')

# Training data pipeline: uniform sampling with replacement.
dataset = Graph_sequence_sampler_pytorch(
    graphs_train,
    max_prev_node=args.max_prev_node,
    max_num_node=args.max_num_node,
)
dataset_size = len(dataset)
uniform_weights = [1.0 / dataset_size for _ in range(dataset_size)]
sample_strategy = torch.utils.data.sampler.WeightedRandomSampler(
    uniform_weights,
    num_samples=args.batch_size*args.batch_ratio,
    replacement=True,
)
dataset_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    sampler=sample_strategy,
)

# The node feature length is read from the first graph and added to the
# adjacency window to obtain the node-level input size.
atts = nx.get_node_attributes(graphs[0], 'feature')
atts_array = list(atts.values())
rnn = GRU_plain(
    input_size=args.max_prev_node+len(atts_array[0]),
    embedding_size=args.embedding_size_rnn,
    hidden_size=args.hidden_size_rnn,
    num_layers=args.num_layers,
    has_input=True,
    has_output=True,
    output_size=args.hidden_size_rnn_output,
).cuda()
output = GRU_plain(
    input_size=1,
    embedding_size=args.embedding_size_rnn_output,
    hidden_size=args.hidden_size_rnn_output,
    num_layers=args.num_layers,
    has_input=True,
    has_output=True,
    output_size=1,
).cuda()
epoch = 1

# One Adam optimizer and one step-wise LR schedule per model.
optimizer_rnn = optim.Adam(list(rnn.parameters()), lr=args.lr)
optimizer_output = optim.Adam(list(output.parameters()), lr=args.lr)
scheduler_rnn = MultiStepLR(optimizer_rnn, milestones=args.milestones, gamma=args.lr_rate)
scheduler_output = MultiStepLR(optimizer_output, milestones=args.milestones, gamma=args.lr_rate)

# Wall-clock duration of every epoch.
time_all = np.zeros(args.epochs)
# Main training loop: one pass over `dataset_loader` per epoch, teacher-forced
# training of the node-level model `rnn` and the edge-level model `output`,
# followed by periodic sampling through `test_graph`.
while epoch<=args.epochs:
    time_start = tm.time()
    rnn.train() #sets the model on train mode
    output.train()
    loss_sum = 0
    print(epoch)
    for batch_idx, data in enumerate(dataset_loader):
#         if batch_idx == 13:
#             break
        rnn.zero_grad()
        output.zero_grad()
        x_unsorted = data['x'].float()
        y_unsorted = data['y'].float()
        att_unsorted = data['att'].float()
        label_unsorted = data['label'] 
        y_len_unsorted = data['len']
        # Trim every tensor to the longest sequence present in this batch.
        y_len_max = max(y_len_unsorted)
        x_unsorted = x_unsorted[:, 0:y_len_max, :]
        y_unsorted = y_unsorted[:, 0:y_len_max, :]
        att_unsorted = att_unsorted[:, 0:y_len_max, :]
        label_unsorted = label_unsorted[:,0:y_len_max]
#         print('x: ', x_unsorted)
#         print('att: ', att_unsorted)
#         print('label: ',label_unsorted)

        # initialize lstm hidden state according to batch size
        rnn.hidden = rnn.init_hidden(batch_size=x_unsorted.size(0))
        # output.hidden = output.init_hidden(batch_size=x_unsorted.size(0)*x_unsorted.size(1))

        # sort input
        # Samples are reordered by decreasing sequence length; the same
        # permutation is applied to x, y, att and label.
        y_len,sort_index = torch.sort(y_len_unsorted,0,descending=True)
        y_len = y_len.numpy().tolist()
        x = torch.index_select(x_unsorted,0,sort_index)
        y = torch.index_select(y_unsorted,0,sort_index)
        att = torch.index_select(att_unsorted,0,sort_index)
        label = torch.index_select(label_unsorted,0,sort_index)
#         print('y_len: ',y_len)
#         print('x: ',x)
#         print('y: ',y)
#         print('att: ', att)
#         print('label: ', label)

        # Flatten the valid (non-padded) rows of y into one 2-D tensor; each
        # row becomes one training sequence for the edge-level model.
        y_reshape = pack_padded_sequence(y,y_len,batch_first=True).data
#         print(y_reshape)
        
        # reverse y_reshape, so that their lengths are sorted, add dimension
        idx = [i for i in range(y_reshape.size(0)-1, -1, -1)]
        idx = torch.LongTensor(idx) # selects the indexes with ind = idx 
        y_reshape = y_reshape.index_select(0, idx) # selects the the elements from 0'th dimension (rows) with index = idx
        y_reshape = y_reshape.view(y_reshape.size(0),y_reshape.size(1),1) # Do the same as reshape
        # Edge-level input is the target shifted right by one position, with a
        # leading 1 as start token.
        output_x = torch.cat((torch.ones(y_reshape.size(0),1,1),y_reshape[:,0:-1,0:1]),dim=1)
        output_y = y_reshape
        # batch size for output module: sum(y_len)
        # Build the per-row lengths for the edge-level sequences, walking the
        # length histogram from the largest length down to 1.
        output_y_len = []
        output_y_len_bin = np.bincount(np.array(y_len))
        for i in range(len(output_y_len_bin)-1,0,-1):
            count_temp = np.sum(output_y_len_bin[i:]) # count how many y_len is above i
            output_y_len.extend([min(i,y.size(2))]*count_temp) # put them in output_y_len; max value should not exceed y.size(2)
        # pack into variable
        x = Variable(x).cuda()
        y = Variable(y).cuda()
        # att and label are moved to the GPU but are not referenced again in
        # this loop.
        att = Variable(att).cuda()
        label = Variable(label).cuda()
        output_x = Variable(output_x).cuda()
        output_y = Variable(output_y).cuda()
        # print(output_y_len)
        # print('len',len(output_y_len))
        # print('y',y.size())
        # print('output_y',output_y.size())
#         print(x[1])
        # if using ground truth to train
        h = rnn(x, pack=True, input_len=y_len)
        h = pack_padded_sequence(h,y_len,batch_first=True).data # get packed hidden vector
        # reverse h
        # Same reversal as applied to y_reshape, so row k of h seeds the
        # edge-level sequence in row k of output_x.
        idx = [i for i in range(h.size(0) - 1, -1, -1)]
        idx = Variable(torch.LongTensor(idx)).cuda()
        h = h.index_select(0, idx)
        # h initialises the first layer of the edge-level model; the remaining
        # layers start from zeros.
        hidden_null = Variable(torch.zeros(args.num_layers-1, h.size(0), h.size(1))).cuda()
        output.hidden = torch.cat((h.view(1,h.size(0),h.size(1)),hidden_null),dim=0) # num_layers, batch_size, hidden_size
        y_pred = output(output_x, pack=True, input_len=output_y_len)
        y_pred = F.sigmoid(y_pred)
        # clean
        # Pack and immediately unpack both tensors using output_y_len before
        # they are compared in the loss.
        y_pred = pack_padded_sequence(y_pred, output_y_len, batch_first=True)
        y_pred = pad_packed_sequence(y_pred, batch_first=True)[0]
        output_y = pack_padded_sequence(output_y,output_y_len,batch_first=True)
        output_y = pad_packed_sequence(output_y,batch_first=True)[0]
        # use cross entropy loss
        loss = binary_cross_entropy_weight(y_pred, output_y)
        loss.backward()
        # update deterministic and lstm
        optimizer_output.step()
        optimizer_rnn.step()
        # NOTE(review): both schedulers are stepped once per batch, so
        # args.milestones presumably count batches rather than epochs - confirm.
        scheduler_output.step()
        scheduler_rnn.step()


        if epoch % args.epochs_log==0 and batch_idx==0: # only output first batch's statistics
            print('Epoch: {}/{}, train loss: {:.6f}, graph type: {}, num_layer: {}, hidden: {}'.format(
                epoch, args.epochs,loss.data, args.graph_type, args.num_layers, args.hidden_size_rnn))

        # logging
#         log_value('loss_'+args.fname, loss.data, epoch*args.batch_ratio+batch_idx)
        # Accumulate the batch loss scaled by the number of entries per sample.
        feature_dim = y.size(1)*y.size(2)
        loss_sum += loss.data*feature_dim
    # Mean scaled loss over the batches of this epoch (batch_idx is the index
    # of the last batch, so an empty loader would leave it undefined).
    print(loss_sum/(batch_idx+1))
#         print('x_output: ', output_x)
#         print('x: ', x)
#         print('x_output: ', output_x)
    time_end = tm.time()
    time_all[epoch - 1] = time_end - time_start
    # test
    # Sample graphs every args.epochs_test epochs once epochs_test_start is reached.
    if epoch % args.epochs_test == 0 and epoch>=args.epochs_test_start:
        test_graph(epoch, args, rnn, output, graphs_test, test_batch_size=args.test_batch_size)
#         fname = args.graph_save_path + args.fname_pred + str(epoch) +'_'+str(sample_time) + '.dat'
#         save_graph_list(G_pred, fname)
        print('test done, graphs saved')
    epoch += 1



#     print(y_unsorted.size(0), ' ',y_unsorted.size(1))
#     print(x_unsorted.size(1),' ',x_unsorted.size(1))
#     if epoch == 2:
#         break


In [None]:
# Testing the GRN and generate the test graph
import argparse
import numpy as np
import os
import re
from random import shuffle
import eval.stats
import utils
from args import Args
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from utils import *
import matplotlib
import csv


# Evaluation settings for comparing generated graphs with the stored reference graphs.
args = Args()

# Generated and reference pickles are read from the same folder.
pred_dir_input = "./modified_graphs/"  # args.dir_input + "/graphs/"
test_dir_input = "./modified_graphs/"

# File-name components (mirror model_name_all / dataset_name_all in evaluation.py).
model_name = 'GraphRNN_RNN'
pred_dataset_name = 'amazon'
test_dataset_name = 'amazon'
hidden = 128  # evaluate.py -> evaluation -> evaluation_epoch

# Epochs whose saved predictions are evaluated: 100, 200, ..., 3000.
epochs = np.arange(100, 3100, 100)
sample_time = 1  # only one, because we use GRAPHRNN_RNN

# Per-epoch results, filled by the evaluation loop.
node_accuracy = []
edge_accuracy = []

acc_shift = 0.05
args.max_num_node = 15
max_edges = 25
def find_nearest_idx(array,value):
    """Return the position of the entry of ``array`` that is closest to ``value``."""
    idx = (np.abs(array-value)).argmin()
    return idx


# For every evaluated epoch: load reference and generated graphs, pair each
# held-out reference graph with the generated graph of the closest node count,
# then measure node-count and edge agreement.
for epoch in epochs:
    fname_test = test_dir_input + model_name + '_' + test_dataset_name + '_' + str(args.num_layers) + '_' + str(hidden) + '_test_' + str(0) + '.dat'
    # The base model's files additionally carry '_' + str(sample_time) before '.dat'.
    fname_pred = pred_dir_input + model_name + '_' + pred_dataset_name + '_' + str(args.num_layers) + '_' + str(hidden) + '_pred_' + str(epoch) + '.dat' # for improved

    graph_test = utils.load_graph_list(fname_test,is_real=True)
    graph_pred = utils.load_graph_list(fname_pred,is_real=False)
    graph_test_len = len(graph_test)
    graph_train = graph_test[0:int(0.8 * graph_test_len)] # train
    graph_validate = graph_test[0:int(0.2 * graph_test_len)] # validate
    graph_test = graph_test[int(0.8 * graph_test_len):] # test on a hold out test set

    # Node counts of the reference and of the generated graphs.
    real_graph_len = np.array([len(g) for g in graph_test])
    pred_graph_len = np.array([len(g) for g in graph_pred])

    # For every reference graph pick the generated graph whose node count is
    # closest, so both lists end up aligned index by index.
    pred_graph_new = []
    pred_graph_len_new = []
    for value in real_graph_len:
        pred_idx = find_nearest_idx(pred_graph_len, value)
        pred_graph_new.append(graph_pred[pred_idx])
        pred_graph_len_new.append(pred_graph_len[pred_idx])

    graph_pred = pred_graph_new
    graph_real = graph_test

    pred_to_real_dic = {}
    node_matches = 0
    edge_matches = 0
    total_edges = 0
    graph_pred_adj = []

    total_nodes_len_list = [len(g.nodes()) for g in graph_real]
    total_edges_len_list = [len(g.edges()) for g in graph_real]

    # Histograms indexed by (size - 1).
    true_node_groups = [0 for _ in range(args.max_num_node)]
    total_node_groups = [0 for _ in range(args.max_num_node)]

    true_edge_groups = [0 for _ in range(max_edges)]
    total_edge_groups = [0 for _ in range(max_edges)]

    for idx in range(len(graph_pred)):
        real_nodes = graph_test[idx].nodes()
        pred_nodes = graph_pred[idx].nodes()
        # Materialise the edges as lists: they are indexed by position below,
        # which an edge view object does not support.
        real_edges = list(graph_test[idx].edges())
        pred_edges = list(graph_pred[idx].edges())
        if len(real_nodes) < args.max_num_node:
            total_node_groups[len(real_nodes)-1] += 1

        # Map every real node id to a local index, in order of first
        # appearance in the edge list.
        temporal_dic = {}
        edge_counter = 0
        for edge in real_edges:
            for node in (edge[0], edge[1]):
                if node not in temporal_dic:
                    temporal_dic[node] = edge_counter
                    edge_counter += 1
        shortest_len = min(len(real_edges),len(pred_edges))

        # Local index -> real node id.
        temporal_dic_inverse = {value: key for key, value in temporal_dic.items()}
        pred_adj = []
        for edge_idx in range(shortest_len):
            real_edge = real_edges[edge_idx]
            pred_edge = pred_edges[edge_idx]
            if len(real_edges) < max_edges:
                total_edge_groups[len(real_edges)-1] += 1
                if pred_edge[0] == temporal_dic[real_edge[0]] and pred_edge[1] == temporal_dic[real_edge[1]]:
                    edge_matches += 1
                    true_edge_groups[len(real_edges)-1] += 1
            total_edges += 1
            pred_edge = list(pred_edge)
            # Predicted endpoints are local indices, so they must be looked up
            # in the inverse map. The membership test therefore has to use the
            # inverse map's keys as well; testing against the real ids either
            # skipped the translation or raised KeyError.
            if pred_edge[0] in temporal_dic_inverse and pred_edge[1] in temporal_dic_inverse:
                pred_edge[0] = temporal_dic_inverse[pred_edge[0]]
                pred_edge[1] = temporal_dic_inverse[pred_edge[1]]
            pred_adj.append(tuple(pred_edge))
        graph_pred_adj.append(pred_adj)
        if len(real_nodes) == len(pred_nodes):
            node_matches += 1 # for improved

    save_graph_list(graph_pred_adj, args.graph_save_path + args.fname_test_with_id + '0.dat') # save for next step

    # Guard the ratios so an epoch without graphs or edges reports 0 instead
    # of raising ZeroDivisionError.
    node_acc = node_matches/len(graph_pred) if len(graph_pred) else 0.0
    edge_acc = edge_matches/total_edges if total_edges else 0.0
    print("node accuracy: ","{:.5f}".format(node_acc))
    print("edge accuracy: ","{:.5f}".format(edge_acc))
    node_accuracy.append(node_acc)
    edge_accuracy.append(edge_acc)

    print(epoch)
    print(total_node_groups,' ', true_node_groups)

# labels = [str(i+1) for i in range(args.max_num_node)]

# x = np.arange(len(labels))  # the label locations
# width = 0.35  # the width of the bars

# fig, ax = plt.subplots()
# rects1 = ax.bar(x - width/2, total_node_groups, width, label='total groups')
# rects2 = ax.bar(x + width/2, true_node_groups, width, label='groups with correct predicted nodes')

# # Add some text for labels, title and custom x-axis tick labels, etc.
# ax.set_ylabel('Number of groups')
# ax.set_xlabel('Group size')
# ax.set_title('Distribution of correct predicted noeds and total nodes over the group size')
# ax.set_xticks(x)
# ax.set_xticklabels(labels)
# ax.legend()


# def autolabel(rects):
#     """Attach a text label above each bar in *rects*, displaying its height."""
#     for rect in rects:
#         height = rect.get_height()
#         ax.annotate('{}'.format(height),
#                     xy=(rect.get_x() + rect.get_width() / 2, height),
#                     xytext=(0, 3),  # 3 points vertical offset
#                     textcoords="offset points",
#                     ha='center', va='bottom')


# # autolabel(rects1)
# # autolabel(rects2)

# fig.tight_layout()

# plt.show()
# fig.savefig("figures_prediction/num_nodes_correct_pred_"+pred_dataset_name+".pdf", bbox_inches='tight')


# labels = [str(i+1) for i in range(max_edges)]

# x = np.arange(len(labels))  # the label locations
# width = 0.35  # the width of the bars

# fig, ax = plt.subplots()
# rects1 = ax.bar(x - width/2, total_edge_groups, width, label='total groups')
# rects2 = ax.bar(x + width/2, true_edge_groups, width, label='groups with correct predicted edges')

# # Add some text for labels, title and custom x-axis tick labels, etc.
# ax.set_ylabel('Number of groups')
# ax.set_xlabel('Group size')
# ax.set_title('Distribution of correct predicted edges and total edges over the group size')
# ax.set_xticks(x)
# ax.set_xticklabels(labels)
# ax.legend()


# def autolabel(rects):
#     """Attach a text label above each bar in *rects*, displaying its height."""
#     for rect in rects:
#         height = rect.get_height()
#         ax.annotate('{}'.format(height),
#                     xy=(rect.get_x() + rect.get_width() / 2, height),
#                     xytext=(0, 3),  # 3 points vertical offset
#                     textcoords="offset points",
#                     ha='center', va='bottom')


# # autolabel(rects1)
# # autolabel(rects2)

# fig.tight_layout()

# plt.show()
# fig.savefig("figures_prediction/num_edges_correct_pred_"+pred_dataset_name+".pdf", bbox_inches='tight')




In [None]:
# Keep the accuracies of the improved model under their own names. Copies are
# taken so that later in-place edits of these lists do not also change
# node_accuracy / edge_accuracy.
improved_node_accuracy = list(node_accuracy)
improved_edge_accuracy = list(edge_accuracy)


In [None]:
## PLOTTING THE results of the first step
# base_node_accuracy = node_accuracy
# base_edge_accuracy = edge_accuracy
# improved_node_accuracy = node_accuracy
# improved_edge_accuracy = edge_accuracy
# print(improved_node_accuracy)
# print(base_node_accuracy)
# comparing node accuracy
# NOTE(review): the first two entries of both node-accuracy curves are
# overwritten with fixed numbers before plotting - confirm this is intended.
base_node_accuracy[0] = 0.83
base_node_accuracy[1] = 0.75
improved_node_accuracy[0] = 0.88
improved_node_accuracy[1] = 0.85

name = 'amazon'

# ---- node accuracy -------------------------------------------------------
# Every plotted point is (curve minimum - shift) + (1 - measured value).
base_node_acc = min(base_node_accuracy) - 4*acc_shift
improved_node_acc = min(improved_node_accuracy) - acc_shift
base_node_accuracy_new = [base_node_acc + (1 - acc) for acc in base_node_accuracy]
improved_node_accuracy_new = [improved_node_acc + (1 - acc) for acc in improved_node_accuracy]

f = plt.figure()
acc_base_Data = {'Epoch': epochs, 'base_Prediction_Accuracy': base_node_accuracy_new}
acc_imp_Data = {'Epoch': epochs, 'imp_Prediction_Accuracy': improved_node_accuracy_new}
df_base = pd.DataFrame(acc_base_Data, columns=['Epoch', 'base_Prediction_Accuracy'])
df_imp = pd.DataFrame(acc_imp_Data, columns=['Epoch', 'imp_Prediction_Accuracy'])

plt.plot(df_base['Epoch'], df_base['base_Prediction_Accuracy'],
         label="base_GraphRNN", color='red', marker='o')
plt.plot(df_imp['Epoch'], df_imp['imp_Prediction_Accuracy'],
         label="improved_GraphRNN", color='blue', marker='o')
plt.legend(loc='lower right')
plt.title('Node_Prediction_Accuracy Vs Epoch', fontsize=14)
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Accuracy', fontsize=14)
plt.grid(True)
plt.show()

f.savefig("figures_prediction/Node_Prediction_Accuracy"+name+".pdf", bbox_inches='tight')

# ---- edge accuracy -------------------------------------------------------
# NOTE(review): unlike the node curves, every plotted edge point is the same
# constant (curve minimum - shift); the per-epoch values are not used.
base_edge_acc = min(base_edge_accuracy) - acc_shift
improved_edge_acc = min(improved_edge_accuracy) - acc_shift
base_edge_accuracy_new = [base_edge_acc for _ in base_edge_accuracy]
improved_edge_accuracy_new = [improved_edge_acc for _ in improved_edge_accuracy]

f = plt.figure()
acc_base_Data = {'Epoch': epochs, 'base_Prediction_Accuracy': base_edge_accuracy_new}
acc_imp_Data = {'Epoch': epochs, 'imp_Prediction_Accuracy': improved_edge_accuracy_new}
df_base = pd.DataFrame(acc_base_Data, columns=['Epoch', 'base_Prediction_Accuracy'])
df_imp = pd.DataFrame(acc_imp_Data, columns=['Epoch', 'imp_Prediction_Accuracy'])

plt.plot(df_base['Epoch'], df_base['base_Prediction_Accuracy'],
         label="base_GraphRNN", color='red', marker='o')
plt.plot(df_imp['Epoch'], df_imp['imp_Prediction_Accuracy'],
         label="improved_GraphRNN", color='blue', marker='o')
plt.legend(loc='lower right')
plt.title('Edge_Prediction_Accuracy Vs Epoch', fontsize=14)
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Accuracy', fontsize=14)
plt.grid(True)
plt.show()

f.savefig("figures_prediction/Edge_Prediction_Accuracy_"+name+".pdf", bbox_inches='tight')


In [None]:
## PLOTTING the dynamic analysis for the second step
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Count, for every dynamic snapshot, the number of distinct ids that appear in
# its adjacency file, store the counts in a CSV file and plot them.
name = 'Amazon'
path = 'dataset/'+name+'/'
dynamic_graphs_path = path + 'dynamic_graphs/'
number_of_intervals = 161
# The series starts with 50 zero entries; the 111 snapshot folders follow.
num_of_groups = [0 for _ in range(50)]

for i in range(111):
    read_path = dynamic_graphs_path + str(i+1) + '/'
    data_adj_dyn = np.loadtxt(read_path+name+'_A.txt', delimiter=',').astype(int)

    if len(data_adj_dyn) != 0:
        num_of_groups.append(len(np.unique(data_adj_dyn)))
    else:
        num_of_groups.append(0)

# Round-trip the counts through a CSV file. The csv module expects files to be
# opened with newline=''; the with-blocks close the handles, so no explicit
# close() calls are needed.
with open('output_amazon.csv', 'w', newline='') as result_file:
    wr = csv.writer(result_file, dialect='excel')
    wr.writerow(num_of_groups)
with open('output_amazon.csv', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)[0]
data = [int(item) for item in data]
labels = [i+1 for i in range(len(data))]
print(data,' ',labels)

f = plt.figure()
dynamic_net = {'Snapshot': labels,
        'number_of_groups': data
       }

df = pd.DataFrame(dynamic_net,columns=['Snapshot','number_of_groups'])

plt.plot(df['Snapshot'], df['number_of_groups'] ,color='red', marker='.')
plt.title('Number of groups Vs Snapshots', fontsize=14)
plt.xticks(np.arange(min(labels), max(labels)+1, 15))
plt.yticks(np.arange(min(data), max(data)+1, 30))
plt.xlabel('Snapshot', fontsize=14)
plt.ylabel('Number_of_groups', fontsize=14)
plt.grid(True)
plt.show()

f.savefig("figures_prediction/number_of_dynamic_groups"+name+".pdf", bbox_inches='tight')

In [None]:
# SAVING the first step results to be used for second step  
import os
import utils
import json
from args import Args
# For every dynamic snapshot, collect the snapshot edges that touch a user
# appearing in the first-step predictions and write them next to the snapshot.
args = Args()
name = 'YELP'
path = 'dataset/'+name+'/'
num_of_intervals = 0
dynamic_graphs_path = path + 'dynamic_graphs/'
# Count the directories found below the dynamic graph folder.
for _, dirnames,_ in os.walk(dynamic_graphs_path):
    num_of_intervals += len(dirnames)

fname_pred = args.graph_save_path + args.fname_test_with_id + '0.dat'
graph_pred = utils.load_graph_list(fname_pred,is_real=False)
for i in range(num_of_intervals-1):
    read_path = dynamic_graphs_path + str(i+1) + '/'
    data_adj_dyn = np.loadtxt(read_path+name+'_A.txt', delimiter=',').astype(int)
    with open(read_path + name +'_user_id_map.txt','r') as data_ids_file:
        data_ids_dyn = json.load(data_ids_file)
    # JSON keys are strings; convert them to int so they can be compared with
    # the ids stored in the predicted edges.
    data_ids_dyn = {int(key): value for key, value in data_ids_dyn.items()}
    data_ids_dyn_inverse = {value: key for key, value in data_ids_dyn.items()}

    # Start every snapshot with an empty result. Without this reset a snapshot
    # with no edges wrote the previous snapshot's edges (or failed with
    # NameError on the first snapshot).
    adj_list = []
    if len(data_adj_dyn) != 0:
        for gr_idx, group in enumerate(graph_pred):
            temp = []
            for user_edge in group:
                if user_edge[0] in data_ids_dyn:
                    mapped_id = data_ids_dyn[user_edge[0]]
                    for adj_edge in data_adj_dyn:
                        if mapped_id == adj_edge[0] or mapped_id == adj_edge[1]:
                            temp.append(list(adj_edge))
            if len(temp) != 0:
                adj_list.append(temp)

    with open(os.path.join(read_path,'_test_YELP_A.txt'),'w') as adj_mat_file:
        for group in adj_list:
            for adj_edge in group:
                adj_mat_file.write(str(adj_edge[0])+', '+str(adj_edge[1])+'\n')


In [None]:
# PREPARING DATA for dynamic training 
from train import *
from utils import *
from data import *
import json
import numpy as np 
import networkx as nx
import os
import csv 
# Load the static dataset from its text files, build one attributed graph and
# split it into one subgraph per group, keeping at most `max_nodes` nodes per
# group.
max_nodes = 15
name = 'Amazon'
## Loading the main graph
print('Loading graph dataset: '+str(name))
G = nx.Graph()

path = 'dataset/'+name+'/'
# The id map is stored as JSON; it is inverted so it can be indexed by value.
# The file is read once inside a with-block so the handle is always closed;
# data_ids_map is an independent copy of the same inverted mapping.
with open(path+name+'_user_id_map.txt','r') as data_ids_file:
    data_ids = json.load(data_ids_file)
data_ids = {value: key for key, value in data_ids.items()}
data_ids_map = dict(data_ids)
data_adj = np.loadtxt(path+name+'_A.txt', delimiter=',').astype(int)
data_node_att = np.loadtxt(path+name+'_node_attributes.txt', delimiter=',')
data_node_label = np.loadtxt(path+name+'_node_labels.txt', delimiter=',').astype(int)
data_graph_indicator_org = np.loadtxt(path+name+'_graph_indicator.txt', delimiter=',').astype(int)
data_graph_labels = np.loadtxt(path+name+'_graph_labels.txt', delimiter=',').astype(int)
data_tuple = list(map(tuple, data_adj))

# Collect the node labels of every group (group ids are 1-based).
node_group_labels = [[] for _ in range(max(data_graph_indicator_org))]
fraud_imposter = [0 for _ in range(max_nodes)]
honest_imposter = [0 for _ in range(max_nodes)]
for label_idx, label in enumerate(data_node_label):
    node_group_labels[data_graph_indicator_org[label_idx]-1].append(label)

# Among groups with graph label 1 and fewer than max_nodes members, count by
# group size those with exactly one node labelled 1 and those with exactly one
# node labelled 0.
for group_idx, group_label in enumerate(node_group_labels):
    one_count = group_label.count(1)
    zero_count = group_label.count(0)
    small_labelled_group = len(group_label)<max_nodes and data_graph_labels[group_idx] == 1
    if one_count == 1 and small_labelled_group:
        fraud_imposter[len(group_label)-1] += 1
    if zero_count == 1 and small_labelled_group:
        honest_imposter[len(group_label)-1] += 1
print(fraud_imposter,honest_imposter)

# create graph
G.add_edges_from(data_tuple)
graph_num = data_graph_indicator_org.max()
count_id = 0
data_graph_indicator = []
graph_num_list = [0 for _ in range(graph_num)]
# Attach attributes to consecutively numbered nodes; rows of a group that
# already holds max_nodes nodes are skipped.
for i in range(data_node_label.shape[0]):
    node_group_id = data_graph_indicator_org[i]
    if graph_num_list[node_group_id-1] < max_nodes:
        graph_num_list[node_group_id-1] += 1
        G.add_node(
            count_id+1,
            feature=data_node_att[i],
            label=data_node_label[i],
            user_id=data_ids[int(i+1)],
            group_id=data_graph_indicator_org[i],
        )
        data_graph_indicator.append(data_graph_indicator_org[i])
        count_id += 1
G.remove_nodes_from(list(nx.isolates(G)))
data_graph_indicator = np.array(data_graph_indicator)
node_list = np.arange(count_id) + 1
graphs = []
total_num_of_nodes = 0
for i in range(graph_num):
    # find the nodes for each graph
    nodes = node_list[data_graph_indicator==i+1]
    if len(nodes)>max_nodes:
        nodes = nodes[0:max_nodes]
    total_num_of_nodes += len(nodes)
    G_sub = G.subgraph(nodes)
    G_attributes = nx.get_node_attributes(G_sub,'feature')
    G_sub.graph['label'] = data_graph_labels[i]
    graphs.append(G_sub)
    G_ids = nx.get_node_attributes(G_sub,'user_id')
# shuffling the data 
# print(total_num_of_nodes)

# Split the (unshuffled) graph list 80/20 and number the users of each split.
graphs_len = len(graphs)
train_cutoff = int(0.8 * graphs_len)

# Held-out split: map every user_id to a running 1-based index.
graphs_test = graphs[train_cutoff:]
graphs_test_len = len(graphs_test)
graphs_test_dic = {}
nodes_test_len = 0
test_id = 0
for graph in graphs_test:
    for user_id in nx.get_node_attributes(graph, 'user_id').values():
        test_id += 1
        nodes_test_len += 1
        graphs_test_dic[user_id] = test_id

# Training split: same numbering scheme.
graphs_train = graphs[:train_cutoff]
graphs_train_len = len(graphs_train)
graphs_train_dic = {}
nodes_train_len = 0
train_id = 0
for graph in graphs_train:
    for user_id in nx.get_node_attributes(graph, 'user_id').values():
        train_id += 1
        nodes_train_len += 1
        graphs_train_dic[user_id] = train_id

# NOTE(review): the validation slice overlaps the training slice.
graphs_validate = graphs[:int(0.2 * graphs_len)]

# Walk through the dynamic snapshots and build, per snapshot, a matrix with
# one row per training user that marks which members of the user's group it
# is connected to. `tracking_user` additionally records, per user and
# snapshot, the labels of the users it interacted with.
num_of_intervals = 0
dynamic_graphs_path = path + 'dynamic_graphs/'
for _, dirnames,_ in os.walk(dynamic_graphs_path):
    num_of_intervals += len(dirnames)
print(num_of_intervals)
num_of_groups = []


# Train data extraction
train_data = []
temp_id = 0
tracking_user = {}
for i in range(num_of_intervals-1):
    train_adj_mat = np.zeros((nodes_train_len,max_nodes))
    read_path = dynamic_graphs_path + str(i+1) + '/'
    with open(read_path + name +'_user_id_map.txt','r') as data_ids_file:
        data_ids_dyn = json.load(data_ids_file)
    data_ids_inverse_dyn = {value: key for key, value in data_ids_dyn.items()}
    data_adj_dyn = np.loadtxt(read_path+name+'_A.txt', delimiter=',').astype(int)
    data_label_dyn = np.loadtxt(read_path+name+'_node_labels.txt', delimiter=',').astype(int)
    data_graph_indicator_dyn = np.loadtxt(read_path+name+'_graph_indicator.txt', delimiter=',').astype(int)

    # Snapshots without edges contribute a zero count and no matrix.
    if len(data_adj_dyn) == 0:
        num_of_groups.append(0)
        continue
    num_of_groups.append(len(np.unique(data_adj_dyn)))

    data_tuple_dyn = list(map(tuple, data_adj_dyn))
    for tuple_item in data_tuple_dyn:
        # Column 1 is treated as the tracked user, column 0 as its neighbour.
        query_id = data_ids_inverse_dyn[int(tuple_item[1])]
        if query_id not in tracking_user:
            # Entry 0 holds the tracked user's own label.
            tracking_user[query_id] = {0: [data_label_dyn[int(tuple_item[1])-1]]}
        # Entry i+1 collects the neighbour labels seen in this snapshot.
        tracking_user[query_id].setdefault(i+1, []).append(data_label_dyn[int(tuple_item[0])-1])

        main_idx = tuple_item[1]
        neighbor_idx = tuple_item[0]
        group_id = data_graph_indicator_dyn[main_idx-1]
        # NOTE(review): group ids look 1-based (graphs_train[group_id - 1]), so
        # '<' presumably excludes the last training group - confirm.
        if group_id < graphs_train_len:
            user_id_main = data_ids_inverse_dyn[main_idx]
            user_id_neighbor = data_ids_inverse_dyn[neighbor_idx]
            user_group = graphs_train[group_id - 1]

            # Row of the tracked user in the training matrix. Reset for every
            # edge so a user missing from the training dictionary cannot
            # reuse the row found for a previous edge.
            main_user_idx = None
            for key,value in graphs_train_dic.items():
                if int(key) == int(user_id_main):
                    main_user_idx = value
                    break
            if main_user_idx is None:
                continue

            # Column = position of the neighbour inside the group.
            user_ids = nx.get_node_attributes(user_group,'user_id')
            for counter, member_id in enumerate(user_ids.values()):
                if int(member_id) == int(user_id_neighbor) and counter<max_nodes:
                    train_adj_mat[main_user_idx-1][counter] = 1
                    break

    train_data.append(train_adj_mat)
#     for m in range(len(train_adj_mat)):
#         for n in range(len(train_adj_mat[0])):
#             if train_adj_mat[m][n] == 1:
#                 print(i,' ',m,' ',n)



In [None]:
# PLOTTING THE DYNAMIC user behavior 
import matplotlib.pyplot as plt
# Per-graph labels; only used by the summary prints at the end of this cell.
data_graph_labels = np.loadtxt(path+name+'_graph_labels.txt', delimiter=',').astype(int)

# For every tracked user, count the snapshots whose label list does not
# contain the user's own label at all:
#   dynamic_genuine_imposter[user]   -> snapshots with no label 0 (user label is 0)
#   dynamic_fraudster_imposter[user] -> snapshots with no label 1 (user label is 1)
# Users that never have such a snapshot get no entry.
dynamic_genuine_imposter = {}
dynamic_fraudster_imposter = {}
# print(tracking_user)
for key, value in tracking_user.items():
    # `value` maps a snapshot key to a list of labels; the first element of the
    # list stored under key 0 is treated as the user's own label.
    query_label = value[0][0]
    if query_label == 0:
        own_label, imposter_counts = 0, dynamic_genuine_imposter
    elif query_label == 1:
        own_label, imposter_counts = 1, dynamic_fraudster_imposter
    else:
        # Labels other than 0/1 are not tallied.
        continue
    for snapshot_key, labels_value in value.items():
        if labels_value.count(own_label) == 0:
            # print(key,' ',snapshot_key,' ', labels_value)
            # dict.get replaces the former bare `except:` initialisation, which
            # would also have swallowed unrelated errors.
            imposter_counts[key] = imposter_counts.get(key, 0) + 1

# Flat lists of the per-user counts, in dict insertion order, for the histograms.
dynamic_fraudster_imposter_list = list(dynamic_fraudster_imposter.values())
dynamic_genuine_imposter_list = list(dynamic_genuine_imposter.values())

# Bar chart: how many label-1 users have exactly k snapshots without any
# label-1 entry. Skipped entirely when no such user exists.
if dynamic_fraudster_imposter_list:
    max_fraudster = max(dynamic_fraudster_imposter_list)
    # Slot k-1 holds the number of users whose count equals k.
    intracted_group_number_fraudster_imposter = [0] * max_fraudster
    for interaction_count in dynamic_fraudster_imposter_list:
        intracted_group_number_fraudster_imposter[interaction_count - 1] += 1
    fraudster_plot_labels = [str(count) for count in range(1, max_fraudster + 1)]

    fig = plt.figure()
    plt.title('Distribution of fraudsters only having interaction with genuine groups Vs \n number of interactoins over time')
    plt.xlabel('Interaction count')
    plt.ylabel('Number of fraudsters')
    plt.bar(fraudster_plot_labels, intracted_group_number_fraudster_imposter, color='cyan', width=0.4)
    # Write each bar's value just above it.
    for bar_index, bar_height in enumerate(intracted_group_number_fraudster_imposter):
        plt.text(bar_index-0.05, bar_height, str(bar_height))

    plt.show()
    fig.savefig("figures_prediction/dynamic_fraudster_imposting_analysis_"+name+".pdf", bbox_inches='tight')
# Bar chart: how many label-0 users have exactly k snapshots without any
# label-0 entry. Skipped entirely when no such user exists.
if dynamic_genuine_imposter_list:
    max_genuine = max(dynamic_genuine_imposter_list)
    # Slot k-1 holds the number of users whose count equals k.
    intracted_group_number_genuine_imposter = [0] * max_genuine
    for interaction_count in dynamic_genuine_imposter_list:
        intracted_group_number_genuine_imposter[interaction_count - 1] += 1
    genuine_plot_labels = [str(count) for count in range(1, max_genuine + 1)]

    fig = plt.figure()
    plt.title('Distribution of genuine users only having interaction with fraudster group Vs \n number of interactoins over time')
    plt.xlabel('Interaction count')
    plt.ylabel('Number of genuine users')
    plt.bar(genuine_plot_labels, intracted_group_number_genuine_imposter, color='grey', width=0.2)
    # Write each bar's value just above it.
    for bar_index, bar_height in enumerate(intracted_group_number_genuine_imposter):
        plt.text(bar_index-0.03, bar_height, str(bar_height))

    plt.show()
    fig.savefig("figures_prediction/dynamic_genuine_imposting_analysis_"+name+".pdf", bbox_inches='tight')


# Summary: number of users with at least one counted snapshot next to the
# number of graphs carrying each label.
graph_label_list = list(data_graph_labels)
print('genuine_imposters: ', len(dynamic_genuine_imposter),' number of fraudster groups: ', graph_label_list.count(1))
print('fraudster_imposters: ', len(dynamic_fraudster_imposter),' number of genuine groups: ', graph_label_list.count(0))

In [None]:
# Grouped bar chart of honest_imposter vs fraud_imposter; the first two
# entries of both sequences are dropped before plotting.
width = 0.35  # the width of the bars
final_honest_imposter = honest_imposter[2:]
final_fraud_imposter = fraud_imposter[2:]
# Tick labels start at "3" because the two leading entries were sliced off.
plot_labels = [str(group_size) for group_size in range(3, 3 + len(final_fraud_imposter))]
print(len(final_fraud_imposter),' ', len(final_honest_imposter), ' ', len(plot_labels))
x = np.arange(len(plot_labels))  # the label locations

fig, ax = plt.subplots()
half_width = width/2
# One pair of bars per tick: honest on the left, fraud on the right.
rects1 = ax.bar(x - half_width, final_honest_imposter, width, label='only one geniune user')
rects2 = ax.bar(x + half_width, final_fraud_imposter, width, label='only one fraudster user')

# Axis titles, chart title, custom tick labels and legend.
ax.set_title('Distribution of group fraudster with only one geniune or fraudster user Vs the group size')
ax.set_xlabel('Group size')
ax.set_ylabel('Number of groups')
ax.set_xticks(x)
ax.set_xticklabels(plot_labels)
ax.legend()

def autolabel(rects):
    """Annotate every bar in *rects* with its height, centred just above it.

    Draws on the module-level ``ax``; returns nothing.
    """
    for bar in rects:
        bar_height = bar.get_height()
        bar_center_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            '{}'.format(bar_height),
            xy=(bar_center_x, bar_height),
            xytext=(0, 3),  # nudge the text 3 points upwards
            textcoords="offset points",
            ha='center',
            va='bottom',
        )


# Per-bar value labels are currently disabled; uncomment to annotate both series.
# autolabel(rects1)
# autolabel(rects2)

# Re-flow the layout before rendering.
fig.tight_layout()

# Show the figure, then write the same figure to a PDF whose file name
# embeds `name`. The "figures_prediction/" directory is not created here.
plt.show()
fig.savefig("figures_prediction/imposter_analysis_"+name+".pdf", bbox_inches='tight')

In [None]:
# Test data extraction
#
# For every interval directory (1 .. num_of_intervals-1) build a
# (21655, max_nodes) 0/1 matrix from that interval's edge list and append it
# to `test_data`. Intervals with an empty edge list contribute no matrix but
# still record a 0 in `num_of_groups`.

max_nodes = 15
test_data = []
num_of_groups = []
test_main_indices = []
for i in range(num_of_intervals-1):
    print(i)
    # NOTE(review): 21655 is hard-coded; presumably the total number of users - confirm.
    test_adj_mat = np.zeros((21655,max_nodes))
    read_path = dynamic_graphs_path + str(i+1) + '/'
    # Close the id-map file as soon as it is parsed instead of leaking one
    # handle per interval.
    with open(read_path + name +'_user_id_map.txt','r') as data_ids_file:
        data_ids_dyn = json.load(data_ids_file)
    # Invert the loaded mapping so node indices can be translated back.
    data_ids_inverse_dyn = {value: key for key, value in data_ids_dyn.items()}
    data_adj_dyn = np.loadtxt(read_path+'_test_YELP_A.txt', delimiter=',').astype(int)
    data_graph_indicator_dyn = np.loadtxt(read_path+name+'_graph_indicator.txt', delimiter=',').astype(int)
    if len(data_adj_dyn) == 0:
        num_of_groups.append(0)
        continue
    # NOTE(review): this counts distinct values in the edge list, not groups.
    num_of_groups.append(len(np.unique(data_adj_dyn)))

    for tuple_item in map(tuple, data_adj_dyn):
        main_idx = tuple_item[1]
        neighbor_idx = tuple_item[0]
        group_id = data_graph_indicator_dyn[main_idx-1]
        user_id_main = data_ids_inverse_dyn[main_idx]
        user_id_neighbor = data_ids_inverse_dyn[neighbor_idx]
        user_group = graphs[group_id - 1]
        # Direct lookup instead of scanning every dict item for an equal key.
        # NOTE(review): when the id is missing, main_user_idx keeps the value
        # from the previous edge (as before) - confirm this is intended.
        if user_id_main in graphs_test_dic:
            main_user_idx = graphs_test_dic[user_id_main]

        # Column = position of the neighbour among the group's nodes; only
        # the first max_nodes positions fit into the matrix.
        user_ids = nx.get_node_attributes(user_group,'user_id')
        for counter, node_user_id in enumerate(user_ids.values()):
            if counter >= max_nodes:
                break
            if node_user_id == user_id_neighbor:
                test_adj_mat[main_user_idx-1][counter] = 1
                test_main_indices.append(main_user_idx-1)
                break
    test_data.append(test_adj_mat)


In [None]:
import torch
import torch.nn
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
total_epoch = 10
current_epoch = 0
time_steps = len(train_data)-1
batch_size = 16
hidden_size = max_nodes
# train the lstm
#
# Many-to-one setup: all snapshots but the last are the input sequence and
# the last snapshot is the target for the LSTM's final time step.
train_data_transposed = [np.transpose(adj_mat) for adj_mat in train_data]
train_data_seg = train_data[0:-1]
target_seg = train_data[-1]
train_batch_number = int(graphs_train_len/batch_size)
train_data_np = np.array(train_data_seg)   # stacked as (time, rows, max_nodes)
target_np = np.array(target_seg)           # (rows, max_nodes)
many_one_model = torch.nn.LSTM(max_nodes,hidden_size)
# Without an optimizer the gradients were only accumulated and the weights
# never changed. Adam with its default learning rate is a starting point.
optimizer = torch.optim.Adam(many_one_model.parameters())
while current_epoch < total_epoch:
    for i in range(train_batch_number):
        # Batch i covers rows [i*batch_size, (i+1)*batch_size). The previous
        # offset i*train_batch_number made batches overlap or run off the end.
        batch_start = i*batch_size
        batch_end = batch_start+batch_size
        # nn.LSTM (batch_first=False) takes input shaped (seq_len, batch, input_size).
        input_data = torch.from_numpy(train_data_np[:,batch_start:batch_end,:]).float()
        target = torch.from_numpy(target_np[batch_start:batch_end,:]).float()

        optimizer.zero_grad()
        predicted_output,_ = many_one_model(input_data)
        last_output = predicted_output[-1,:,:]   # output of the final time step
        print(last_output, ' ',target)
        error = torch.nn.functional.binary_cross_entropy_with_logits(last_output,target)
        error.backward()
        optimizer.step()
    current_epoch += 1

In [None]:
save_path = 'gcn/data/'
name = 'Yelp_test'
test_main_indices = list(np.unique(np.array(test_main_indices)))
# print(test_main_indices)
test_data_np = np.array(test_data)
# nn.LSTM returns (output, (h_n, c_n)). Index the tuple directly instead of
# pushing the nested tuple of tensors through np.array(), which is unreliable.
with torch.no_grad():
    test_scores = many_one_model(torch.from_numpy(test_data_np).float())
final_test_score = test_scores[0].detach().numpy()
last_snapshot = final_test_score[-1]   # scores of the final time step

features = np.loadtxt(path + '/YELP_node_attributes.txt',delimiter=',')
labels = np.loadtxt(path+ '/YELP_node_labels.txt', delimiter=',').astype(int)

# print(features[0])

# features_list = list(feature)
# labels_list = list(labels)
graph_dic = {}
label_list = []
feature_list = []
# NOTE(review): rows are selected by enumeration position (`index`), not by the
# stored row id (`item`) - confirm that this is intended.
for index, item in enumerate(test_main_indices):
    # Column with the largest absolute score for this row.
    strongest_column = np.argmax(np.abs(last_snapshot[index]))
    adj_temp = [index + strongest_column]
    graph_dic[index] = adj_temp
    # One-hot encode the binary label.
    label2d = [0, 0]
    label2d[labels[index]] = 1
    label_list.append(label2d)
    feature_list.append(features[index])

# NOTE(review): graph_dic is built above but never written to disk in this cell.
# Context managers make sure every output file is flushed and closed.
with open(save_path + 'ind.{}.tx'.format(name),'wb') as tx_file:
    pkl.dump(np.array(feature_list), tx_file)
with open(save_path +'ind.{}.ty'.format(name),'wb') as ty_file:
    pkl.dump(np.array(label_list), ty_file)
with open(save_path+'ind.{}.test.index'.format(name),'w') as index_file:
    for item in test_main_indices:
        index_file.write(str(item) + '\n')

# print(last_snapshot[0], last_snapshot[1])
# snap = [i for i in range(len(num_of_groups)) if num_of_groups[i] != 0]
# print(len(snap))

# print(len(test_scores_np),' ',len(test_scores_np[0]),' ',len(test_scores_np[0][0]),' ',len(test_scores_np[0][0][0]),' ',) 

In [None]:
from __future__ import division
from __future__ import print_function

import time
import tensorflow.compat.v1 as tf 

from utils import *
from models import GCN, MLP
import pickle as pkl
import sys
import scipy.sparse as sp
# Directory and dataset name used to build the "ind.<dataset_str>.<suffix>"
# file names that the loading code further down in this cell opens.
# NOTE(review): dataset_str is 'Amazon' while the 'dataset' flag below defaults
# to 'Yelp'; the visible loading code only uses dataset_str - confirm which is meant.
path = 'gcn/data/'
dataset_str = 'Amazon'

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
# Dummy 'f' flag - presumably absorbs the "-f <kernel file>" argument that
# Jupyter passes to the kernel; TODO confirm.
tf.app.flags.DEFINE_string('f', '', 'kernel')
# The code below uses placeholders and tf.Session, so eager execution is turned off.
tf.compat.v1.disable_eager_execution()
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'Yelp', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed', 'yelp', 'amazone'
flags.DEFINE_string('model', 'gcn', 'Model string.')  # 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')

# # Load data
# adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)
"""
Loads input data from gcn/data directory

ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
    (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
    object;
ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

All objects above must be saved using python pickle module.

:param dataset_str: Dataset name
:return: All data input files loaded (as well the training/test data).
"""
# train.py
# Define model evaluation function
def evaluate(features, support, labels, mask, placeholders):
    """Run one evaluation pass through the module-level ``sess`` and ``model``.

    Returns ``(loss, accuracy, precision, recall, elapsed_seconds)``, where the
    elapsed time covers both feed-dict construction and the session run.
    """
    started_at = time.time()
    feed = construct_feed_dict(features, support, labels, mask, placeholders)
    fetches = [model.loss, model.accuracy, model.precision, model.recall]
    loss, accuracy, precision, recall = sess.run(fetches, feed_dict=feed)
    return loss, accuracy, precision, recall, (time.time() - started_at)


def construct_feed_dict(features, support, labels, labels_mask, placeholders):
    """Map every placeholder to the value it should be fed with.

    ``placeholders['support']`` is a sequence matched position by position
    with ``support``; ``num_features_nonzero`` receives ``features[1].shape``.
    """
    feed_dict = {
        placeholders['labels']: labels,
        placeholders['labels_mask']: labels_mask,
        placeholders['features']: features,
    }
    support_slots = placeholders['support']
    for position, matrix in enumerate(support):
        feed_dict[support_slots[position]] = matrix
    feed_dict[placeholders['num_features_nonzero']] = features[1].shape
    return feed_dict

# utils.py
def sparse_to_tuple(sparse_mx):
    """Turn a sparse matrix into a ``(coords, values, shape)`` triple.

    A list argument is converted element by element *in place* and the same
    list object is returned; anything else is converted and returned directly.
    """
    def to_tuple(mx):
        # Work on the COO form, converting only when necessary.
        coo = mx if sp.isspmatrix_coo(mx) else mx.tocoo()
        coords = np.vstack((coo.row, coo.col)).transpose()
        return coords, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return to_tuple(sparse_mx)

    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = to_tuple(matrix)
    return sparse_mx
    
#utils
def parse_index_file(filename):
    """Parse an index file containing one integer per line.

    :param filename: path of the text file to read.
    :return: list of ints in file order; blank lines are skipped.
    :raises ValueError: if a non-blank line is not an integer.
    """
    # The context manager closes the handle, which the bare open() never did.
    with open(filename) as index_file:
        return [int(line.strip()) for line in index_file if line.strip()]
#utils
def sample_mask(idx, l):
    """Create a boolean mask of length *l* that is True at the positions in *idx*.

    :param idx: index or sequence of indices to switch on.
    :param l: length of the mask.
    :return: 1-D numpy array with dtype ``bool``.
    """
    mask = np.zeros(l)
    mask[idx] = 1
    # The builtin bool replaces the np.bool alias, which NumPy 1.24 removed.
    return np.array(mask, dtype=bool)

#utils
def preprocess_features(features):
    """Column-normalize feature matrix and convert to tuple representation.

    Every column is divided by its sum (``sum(0)`` followed by a right
    multiplication with the diagonal of reciprocals). Columns that sum to
    zero are left untouched instead of being scaled by infinity.

    :param features: scipy sparse feature matrix.
    :return: ``(coords, values, shape)`` as produced by ``sparse_to_tuple``.
    """
    # Float dtype so that integer-valued inputs can take a negative power.
    colsum = np.array(features.sum(0), dtype=float)
    with np.errstate(divide='ignore'):
        c_inv = np.power(colsum, -1).flatten()
    # A zero column sum gives 1/0 = inf; use 0 so it cannot create NaNs.
    c_inv[np.isinf(c_inv)] = 0.
    c_mat_inv = sp.diags(c_inv)
    features_ch = features.dot(c_mat_inv)
    return sparse_to_tuple(features_ch)

#utils
def preprocess_adj(adj):
    """Add the identity to *adj*, normalize it and return the tuple form.

    The normalization itself is delegated to ``normalize_adj``.
    """
    with_identity = adj + sp.eye(adj.shape[0])
    return sparse_to_tuple(normalize_adj(with_identity))

# Load the seven pickled "ind.<dataset_str>.<suffix>" objects in a fixed order.
names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
objects = []
# On Python 3 the pickles are read with latin1 decoding.
pickle_kwargs = {'encoding': 'latin1'} if sys.version_info > (3, 0) else {}
for suffix in names:
    pickle_path = path+"ind.{}.{}".format(dataset_str, suffix)
    with open(pickle_path, 'rb') as f:
        objects.append(pkl.load(f, **pickle_kwargs))

x, y, tx, ty, allx, ally, graph = objects

# Test indices as stored in the index file, and the same indices sorted.
test_idx_reorder = parse_index_file(path+"ind.{}.test.index".format(dataset_str))
test_idx_range = np.sort(test_idx_reorder)

# Stack training and test rows, then permute the test rows from sorted order
# into index-file order (features and labels alike).
features = sp.vstack((allx, tx)).tolil()
features[test_idx_reorder, :] = features[test_idx_range, :]
labels = np.vstack((ally, ty))
labels[test_idx_reorder, :] = labels[test_idx_range, :]
adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

# Split: the first len(y) rows train, the following 500 validate, the indexed rows test.
num_labeled = len(y)
idx_test = test_idx_range.tolist()
idx_train = range(num_labeled)
idx_val = range(num_labeled, num_labeled+500)

num_rows = labels.shape[0]
train_mask = sample_mask(idx_train, num_rows)
val_mask = sample_mask(idx_val, num_rows)
test_mask = sample_mask(idx_test, num_rows)

# Label matrices that are zero everywhere outside their own split.
y_train, y_val, y_test = (np.zeros(labels.shape) for _ in range(3))
for split_labels, split_mask in ((y_train, train_mask), (y_val, val_mask), (y_test, test_mask)):
    split_labels[split_mask, :] = labels[split_mask, :]


# print(features[0:10])
# Some preprocessing
# features_ch is the (coords, values, shape) triple returned by
# preprocess_features, so features_ch[2] is the matrix shape.
features_ch = preprocess_features(features)
# print(features_ch[2])

# Pick the support matrices and the model class from the 'model' flag.
if FLAGS.model == 'gcn':
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = GCN
elif FLAGS.model == 'gcn_cheby':
    # One support per polynomial term: degree 0 .. max_degree.
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
    model_func = GCN
elif FLAGS.model == 'dense':
    support = [preprocess_adj(adj)]  # Not used
    num_supports = 1
    model_func = MLP
else:
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

# Define placeholders
# The keys must match the ones construct_feed_dict fills in; 'dropout'
# defaults to 0 and is only overridden in the training feed.
placeholders = {
    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
    'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(features_ch[2], dtype=tf.int64)),
    'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
    'labels_mask': tf.placeholder(tf.int32),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
}
# Create model
# input_dim is the number of feature columns (second entry of the shape).
model = model_func(placeholders, input_dim=features_ch[2][1], logging=True)

# Initialize session
sess = tf.Session()

# Init variables
sess.run(tf.global_variables_initializer())

cost_val = []
five_decimals = "{:.5f}".format

# Train model: one full-batch step per epoch followed by a validation pass.
for epoch in range(FLAGS.epochs):

    t = time.time()
    # Training feed: same as evaluation, plus the dropout rate.
    feed_dict = construct_feed_dict(features_ch, support, y_train, train_mask, placeholders)
    feed_dict[placeholders['dropout']] = FLAGS.dropout

    # Training step; outs[1] is the loss and outs[2] the accuracy.
    outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)

    # Validation
    cost, acc, pre, recall, duration = evaluate(features_ch, support, y_val, val_mask, placeholders)
    cost_val.append(cost)

    # Print results
    print("Epoch:", '%04d' % (epoch + 1),
          "train_loss=", five_decimals(outs[1]),
          "train_acc=", five_decimals(outs[2]),
          "val_loss=", five_decimals(cost),
          "val_acc=", five_decimals(acc),
          "val_pre=", five_decimals(pre),
          "time=", five_decimals(time.time() - t))

    # Stop once the newest validation loss exceeds the mean of the
    # `early_stopping` losses recorded just before it.
    patience = FLAGS.early_stopping
    if epoch > patience and cost_val[-1] > np.mean(cost_val[-(patience+1):-1]):
        print("Early stopping...")
        break
print("Optimization Finished!")

# Testing
test_cost, test_acc, test_pre, test_recall, test_duration = evaluate(
    features_ch, support, y_test, test_mask, placeholders)
print("Test set results:",
      "cost=", five_decimals(test_cost),
      "accuracy=", five_decimals(test_acc),
      "precision=", five_decimals(test_pre),
      "recall=", five_decimals(test_recall),
      "time=", five_decimals(test_duration))

