In [2]:
from evaluation.metrics import get_statistics
import numpy as np
import utils.graph_utils as graph_utils
import json
import os
from scipy.io import loadmat
import numpy as np
from networkx.readwrite import json_graph


In [4]:
# Input locations, relative to the notebook's working directory.
# The two dataset folders are handed to Dataset(), which reads the
# GraphSAGE-format files (G.json, id2idx.json, optional feats.npy) from them.
source_dataset_URL="graph_data/douban/online/graphsage/"
target_dataset_URL="graph_data/douban/offline/graphsage/"
# Path of the ground-truth alignment file; it is passed to graph_utils.load_gt
# further down in this cell, where the name `groundtruth` is rebound to the
# loaded mapping.
groundtruth="graph_data/douban/dictionaries/groundtruth"
class Dataset:
    """
    Load one network stored in the GraphSAGE folder layout.

    The data folder must contain ``G.json`` and ``id2idx.json``. ``feats.npy``
    (node features) is optional and loaded when present; ``edge_feats.mat``
    (edge features) is optional and only read by ``load_edge_features``.

    Arguments:
    - data_dir: Data directory which contains the files mentioned above.

    Attributes set by the constructor:
    - G: graph built by ``networkx.readwrite.json_graph.node_link_graph``.
    - id2idx: mapping read from ``id2idx.json`` (node id -> index).
    - idx2id: inverse of ``id2idx``.
    - features: array loaded from ``feats.npy`` or ``None`` when the file is absent.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir
        # id2idx must be loaded first: _load_G relabels link endpoints via idx2id.
        self._load_id2idx()
        self._load_G()
        self._load_features()
        # Called for its side effect; the return value is discarded. Per the
        # original author's note, building the adjacency also stores the
        # index-based edge list at "edges.edgelist.npy" next to the data
        # (behaviour of graph_utils.construct_adjacency -- TODO confirm).
        graph_utils.construct_adjacency(self.G, self.id2idx, sparse=False, file_path=self._edgelist_path())
        # Edge features are intentionally not loaded here; call
        # load_edge_features() explicitly when they are needed.
        print("Dataset info:")
        print("- Nodes: ", len(self.G.nodes()))
        print("- Edges: ", len(self.G.edges()))

    def _edgelist_path(self):
        """Return the edge-list path handed to ``graph_utils.construct_adjacency``."""
        return self.data_dir + "/edges.edgelist"

    def _load_G(self):
        """
        Read ``G.json`` and build ``self.G``.

        Every link's ``source``/``target`` is translated through ``self.idx2id``
        before the graph is built, so ``_load_id2idx`` must have run already.
        Only the ``source`` and ``target`` keys of each link are kept.
        """
        # Context manager so the file handle is closed deterministically.
        with open(os.path.join(self.data_dir, "G.json")) as graph_file:
            G_data = json.load(graph_file)
        G_data['links'] = [
            {'source': self.idx2id[link['source']], 'target': self.idx2id[link['target']]}
            for link in G_data['links']
        ]
        self.G = json_graph.node_link_graph(G_data)

    def _load_id2idx(self):
        """Read ``id2idx.json`` into ``self.id2idx`` and build the inverse ``self.idx2id``."""
        id2idx_file = os.path.join(self.data_dir, 'id2idx.json')
        with open(id2idx_file) as mapping_file:
            self.id2idx = json.load(mapping_file)
        self.idx2id = {v: k for k, v in self.id2idx.items()}

    def _load_features(self):
        """
        Load ``feats.npy`` into ``self.features``.

        Returns the loaded array, or ``None`` (also stored on the instance)
        when the file does not exist.
        """
        feats_path = os.path.join(self.data_dir, 'feats.npy')
        self.features = np.load(feats_path) if os.path.isfile(feats_path) else None
        return self.features

    def load_edge_features(self):
        """
        Load ``edge_feats.mat`` into ``self.edge_features``.

        The ``'edge_feats'`` entry of the .mat file is read and each element of
        its first row is densified with ``.toarray()`` into one slice of an
        array of shape (number of matrices, number of nodes, number of nodes).
        Returns that array, or ``None`` (also stored on the instance) when the
        file does not exist.
        """
        self.edge_features = None
        feats_path = os.path.join(self.data_dir, 'edge_feats.mat')
        if os.path.isfile(feats_path):
            edge_feats = loadmat(feats_path)['edge_feats']
            n_nodes = len(self.G.nodes())
            self.edge_features = np.zeros((len(edge_feats[0]), n_nodes, n_nodes))
            # Each entry is presumably a scipy sparse matrix (it must expose
            # .toarray()) -- TODO confirm against the data files.
            for idx, matrix in enumerate(edge_feats[0]):
                self.edge_features[idx] = matrix.toarray()
        return self.edge_features

    def get_adjacency_matrix(self, sparse=False):
        """
        Return the adjacency matrix built by ``graph_utils.construct_adjacency``.

        ``sparse`` is forwarded to the helper (the previous implementation
        accepted the argument but always passed ``sparse=False``).
        """
        return graph_utils.construct_adjacency(self.G, self.id2idx, sparse=sparse, file_path=self._edgelist_path())

    def get_nodes_degrees(self):
        """Return ``graph_utils.build_degrees`` for this graph and id2idx mapping."""
        return graph_utils.build_degrees(self.G, self.id2idx)

    def get_nodes_neighbors(self):
        """Return ``graph_utils.build_neighbors`` for this graph and id2idx mapping."""
        return graph_utils.build_neighbors(self.G, self.id2idx)

    def get_nodes_clustering(self):
        """Return ``graph_utils.build_clustering`` for this graph and id2idx mapping."""
        return graph_utils.build_clustering(self.G, self.id2idx)

    def get_edges(self):
        """Return ``graph_utils.get_edges`` for this graph and id2idx mapping."""
        return graph_utils.get_edges(self.G, self.id2idx)

    def check_id2idx(self):
        """
        Check that ``id2idx`` agrees with the iteration order of ``self.G.nodes()``.

        Returns ``True`` when the i-th node of the graph maps to index ``i``
        for every node; otherwise prints the first offending node and returns
        ``False``.
        """
        for i, node in enumerate(self.G.nodes()):
            if self.id2idx[node] != i:
                print("Failed at node %s" % str(node))
                return False
        return True

# Load both networks; each Dataset() call prints its node and edge counts.
source_dataset = Dataset(source_dataset_URL)
target_dataset = Dataset(target_dataset_URL)
# NOTE: `groundtruth` is rebound here from the file path to the object returned
# by graph_utils.load_gt (requested in 'dict' format, using both id2idx maps).
# Because the path is consumed by this call, the whole cell must be re-run
# from the top to reload the ground truth.
groundtruth = graph_utils.load_gt(groundtruth, source_dataset.id2idx, target_dataset.id2idx, 'dict')

Dataset info:
- Nodes:  3906
- Edges:  8164
Dataset info:
- Nodes:  1118
- Edges:  1511


In [5]:
# Score matrix saved by an external run; the path is relative to the
# notebook's working directory.
S = np.load('../GradAlign_plus-main/S_file.npy')

# Unpacking order is taken as-is from the original cell; the return order of
# get_statistics is not visible in this notebook -- TODO confirm.
acc, MAP, top5, top10, AUC, top1 = get_statistics(
    S, groundtruth, use_greedy_match=False, get_all_metric=True
)

# NOTE(review): the output recorded for this cell shows Accuracy 0.0000 and
# AUC 0.4963, which looks like chance level -- check that the rows/columns of
# S follow the same index convention as the id2idx maps used for `groundtruth`.
report = (
    ("Accuracy: {:.4f}", acc),
    ("MAP: {:.4f}", MAP),
    ("AUC: {:.4f}", AUC),
    ("Precision_1: {:.4f}", top1),
    ("Precision_5: {:.4f}", top5),
    ("Precision_10: {:.4f}", top10),
)
for line_template, metric_value in report:
    print(line_template.format(metric_value))

Accuracy: 0.0000
MAP: 0.0060
AUC: 0.4963
Precision_1: 0.0000
Precision_5: 0.0036
Precision_10: 0.0107


In [13]:
def top_k(S, k=1):
    """
    Mark, for every row of a score matrix, its k highest-scoring columns.

    S: scores, numpy array of shape (M, N) where M is the number of source
        nodes and N is the number of target nodes
    k: number of predicted elements to keep per row

    Returns a float array with the same shape as S holding 1 at the selected
    positions and 0 everywhere else.
    """
    # Sorting the negated scores ascending ranks columns by descending score.
    best_columns = np.argsort(-S)[:, :k]
    indicator = np.zeros(S.shape)
    # Column vector of row numbers broadcasts against the (M, k) column picks.
    row_ids = np.arange(S.shape[0])[:, None]
    indicator[row_ids, best_columns] = 1
    return indicator

def compute_precision_k(top_k_matrix, gt):
    """
    Fraction of ground-truth pairs that are marked in a top-k indicator matrix.

    top_k_matrix: 2-D array indexed as [source, target]; an entry equal to 1
        means the target is among the predictions for that source (this is
        the kind of matrix returned by a top-k selection).
    gt: dict mapping a source row index to its true target column index.

    Returns n_matched / len(gt). An empty `gt` yields 0.0 instead of raising
    ZeroDivisionError.
    """
    if not gt:
        # No pairs to score: define the precision as 0.0 rather than dividing by zero.
        return 0.0
    n_matched = sum(1 for key, value in gt.items() if top_k_matrix[key, value] == 1)
    return n_matched / len(gt)

In [22]:
# Ad-hoc inspection: confirms the loaded ground truth is a plain dict (see the
# recorded output). Not needed for the evaluation itself.
type(groundtruth)

dict

In [14]:
# Top-1 indicator matrix from the score matrix S, then the fraction of
# ground-truth pairs whose target is the single best-scoring column.
top_1 = top_k(S, 1)
top1_eval = compute_precision_k(top_1, groundtruth)

In [None]:
# Top-k indicator matrices for k = 1, 5, 10.
top_1 = top_k(S, 1)
top_5 = top_k(S, 5)
top_10 = top_k(S, 10)

# compute_precision_k takes (top_k_matrix, gt) and indexes the matrix directly
# with the keys/values of the ground-truth dict, so the loaded `groundtruth`
# is passed as-is. The earlier four-argument calls with `gt_dict`,
# `idx1_dict` and `idx2_dict` referred to names that are not defined in this
# notebook and did not match the function's signature.
top1_eval = compute_precision_k(top_1, groundtruth)
top5_eval = compute_precision_k(top_5, groundtruth)
top10_eval = compute_precision_k(top_10, groundtruth)

print('Success@1 : {:.4f}'.format(top1_eval))
print('Success@5 : {:.4f}'.format(top5_eval))
print('Success@10 : {:.4f}'.format(top10_eval))