In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import argparse
import random
from datetime import datetime
import math
import csv
from csv import DictWriter
from tqdm import tqdm
import pandas as pd
import scipy.sparse as ssp
import numpy as np
import networkx as nx
import ast

from sklearn.preprocessing import normalize
from sklearn import metrics
from sklearn.metrics import (precision_recall_curve, average_precision_score, roc_auc_score,
                              precision_score, recall_score, f1_score)

import torch
from torch.nn import (BCEWithLogitsLoss)
import torch.nn.functional as F
from torch.utils.data import DataLoader

import node2vec2 as n2v
#from node2vec import Node2Vec
from gensim.models import Word2Vec


from torch_geometric.nn import GCNConv, SAGEConv, GAE
from torch_geometric.utils import (negative_sampling)
from torch_geometric.data import Data

from ogb.linkproppred import PygLinkPropPredDataset


In [2]:
def _write_edge_csv(file_path, edges):
    """Write an edge tensor to ``file_path`` as a CSV with header ``id1,id2``.

    Each row of ``edges`` is unpacked as one ``(id1, id2)`` pair.
    """
    # newline='' is the mode the csv module documents for files handed to a
    # writer; the writer then controls the line terminator on every platform.
    with open(file_path, mode='w', newline='') as f_object:
        writer = DictWriter(f_object, fieldnames=['id1', 'id2'])
        writer.writeheader()
        writer.writerows({'id1': u, 'id2': v} for u, v in edges.tolist())


def datasets():
    """Build, or reload from the on-disk cache, everything the run needs.

    The cache lives in ``data/<args.dataset>/<args.dataset_name>``. When
    ``graph_info.csv`` is missing there, the graph is loaded through
    ``data_loader()``, its matrices and edge splits are written to CSV and
    ``subgraph_primitive`` is run for the train and test splits. Otherwise
    the cached values are parsed back from ``graph_info.csv``.

    In both cases the per-subgraph feature CSVs are generated if absent
    (``subgraph_features``) and loaded as padded tensors (``subgraph_matrixs``).

    Reads the module-level ``args`` (dataset, dataset_name, subgraph_type).

    Returns:
        A 15-tuple: ``(data_train, data_test, A, A_train, A_test,
        edge_pos_train, edge_neg_train, edge_pos_test, edge_neg_test,
        num_subgraph_nodes, sub_feature_train_matrix, sub_adj_train_matrix,
        sub_feature_test_matrix, sub_adj_test_matrix, num_subgraph_nodes)``.
        ``num_subgraph_nodes`` appears twice; kept for existing callers.
    """
    path = "data/%s/%s" %(args.dataset, args.dataset_name)

    isExist = os.path.exists(path + "/graph_info.csv")

    if not isExist:
        # The directory can already exist without graph_info.csv (e.g. after
        # an interrupted run); a plain makedirs() would raise in that case.
        os.makedirs(path, exist_ok=True)

        if args.dataset == 'ogb':
            # NOTE(review): this branch never defines A_train / A_test, the
            # edge splits, data_train / data_test or
            # num_subgraph_nodes_primitive, all of which are used further
            # down, so an 'ogb' run cannot complete as written.
            dataset = PygLinkPropPredDataset(name=args.dataset_name)
            data = dataset[0]
            split_edge = dataset.get_edge_split()
            pos_train_edge = split_edge['train']['edge']
            split_edge['train']['edge_neg'] = negative_sampling(data.edge_index, num_nodes=data.num_nodes,
                num_neg_samples=pos_train_edge.size(0)).T

            edge_weight = torch.ones(data.edge_index.size(1), dtype=int)
            A = ssp.csr_matrix((edge_weight, (data.edge_index[0], data.edge_index[1])),
                               shape=(data.num_nodes, data.num_nodes))
            n_v = data.num_nodes
            n_e = data.num_edges
            num_subgraph_nodes = math.ceil((2*n_e/n_v)*(1+((2*n_e)/((n_v)*(n_v-1)))))#PLACN

        elif args.dataset == 'other':

            A, A_train, A_test, data_x, n_v, n_e, train_message_edges, test_message_edges,\
            edge_pos_train, edge_neg_train, edge_pos_test, edge_neg_test = data_loader()

            data_train = Data(x = data_x, edge_index = train_message_edges, num_nodes = n_v)
            data_test = Data(x = data_x, edge_index = test_message_edges, num_nodes = n_v)

            # Subgraph size: average degree scaled by (1 + density).
            num_subgraph_nodes = math.ceil((2*n_e/n_v)*(1+((2*n_e)/((n_v)*(n_v-1)))))#PLACN
            num_subgraph_nodes_primitive = num_subgraph_nodes + math.ceil(0.5*num_subgraph_nodes) + 2

            field_names = ['adj_matrix', 'adj_matrix_train', 'adj_matrix_test', 'data_x', 'num_vertices', 'num_edges',
                           'edge_index', 'train_message_edges', 'test_message_edges', 'edge_pos_train', 'edge_neg_train',
                           'edge_pos_test', 'edge_neg_test','num_subgraph_nodes']

            # 'edge_index' has no entry below, so DictWriter leaves that
            # column empty (same as before).
            graph_info = {'adj_matrix':A.toarray().tolist(),
                          'adj_matrix_train':A_train.toarray().tolist(),
                          'adj_matrix_test':A_test.toarray().tolist(),
                          'data_x':data_x.tolist(), 'num_vertices':n_v,  'num_edges':n_e,
                          'train_message_edges':train_message_edges.tolist(),
                          'test_message_edges':test_message_edges.tolist(),
                          'edge_pos_train':edge_pos_train.tolist(),
                          'edge_neg_train':edge_neg_train.tolist(),
                          'edge_pos_test':edge_pos_test.tolist(),
                          'edge_neg_test':edge_neg_test.tolist(),
                          'num_subgraph_nodes':num_subgraph_nodes}

            with open(path + "/graph_info.csv", mode='w', newline='') as f_object:
                writer = DictWriter(f_object, fieldnames=field_names)
                writer.writeheader()
                writer.writerow(graph_info)

            # One open per file instead of one open per edge.
            _write_edge_csv(path+'/%s_train_edges.csv'%(args.dataset_name), train_message_edges)
            _write_edge_csv(path+'/%s_test_edges.csv'%(args.dataset_name), test_message_edges)

        # Supervision edges (positives followed by negatives) as two
        # parallel endpoint arrays.
        train_edges_x = np.concatenate([edge_pos_train.T[0],edge_neg_train.T[0]])
        train_edges_y = np.concatenate([edge_pos_train.T[1],edge_neg_train.T[1]])
        subgraph_primitive(A_train,train_edges_x , train_edges_y, num_nodes=num_subgraph_nodes_primitive, type_data='train' )

        test_edges_x = np.concatenate([edge_pos_test.T[0],edge_neg_test.T[0]])
        test_edges_y = np.concatenate([edge_pos_test.T[1],edge_neg_test.T[1]])
        subgraph_primitive(A_test, test_edges_x, test_edges_y, num_nodes=num_subgraph_nodes_primitive, type_data='test' )

    else:

        if args.dataset == 'ogb':

            # NOTE(review): nothing is extracted from the cache for 'ogb';
            # the code below will fail on undefined names.
            df = pd.read_csv(path+"/graph_info.csv")

        elif args.dataset == 'other':

            df = pd.read_csv(path+"/graph_info.csv")

            def cell(column):
                # First non-NaN value of a column in the one-row cache file.
                return df[column].dropna().values[0]

            A = ssp.csr_matrix(ast.literal_eval(cell('adj_matrix')))
            A_train = ssp.csr_matrix(ast.literal_eval(cell('adj_matrix_train')))
            A_test = ssp.csr_matrix(ast.literal_eval(cell('adj_matrix_test')))
            data_x = torch.tensor(ast.literal_eval(cell('data_x')))
            train_message_edges = torch.tensor(ast.literal_eval(cell('train_message_edges')))
            test_message_edges = torch.tensor(ast.literal_eval(cell('test_message_edges')))
            edge_pos_train = torch.tensor(ast.literal_eval(cell('edge_pos_train')))
            edge_neg_train = torch.tensor(ast.literal_eval(cell('edge_neg_train')))
            edge_pos_test = torch.tensor(ast.literal_eval(cell('edge_pos_test')))
            edge_neg_test = torch.tensor(ast.literal_eval(cell('edge_neg_test')))
            n_v = cell('num_vertices')
            n_e = cell('num_edges')
            num_subgraph_nodes = cell('num_subgraph_nodes')

            data_train = Data(x = data_x, edge_index = train_message_edges, num_nodes = n_v)
            data_test = Data(x = data_x, edge_index = test_message_edges, num_nodes = n_v)

    # Generate the per-subgraph feature CSVs only when they are missing.
    if (args.subgraph_type == 'hhop' or args.subgraph_type == 'hhop-DIS'):
        isExist = os.path.exists(path + "/subgraphs_train_features.csv")
        if not isExist:
            subgraph_features(A, num_subgraph_nodes)

    elif args.subgraph_type == 'DIS':
        isExist = os.path.exists(path + "/subgraphs_train_features_dist.csv")
        if not isExist:
            subgraph_features(A, num_subgraph_nodes)

    sub_feature_train_matrix, sub_adj_train_matrix = subgraph_matrixs(data_train, A_train, num_subgraph_nodes+2, 'train')
    sub_feature_test_matrix, sub_adj_test_matrix = subgraph_matrixs(data_test, A_test, num_subgraph_nodes+2, 'test')

    print("number of nodes:", n_v)
    print("number of edges:", n_e)
    print("number of message edges in train:", len(train_message_edges))
    print("number of message edges in test:", len(test_message_edges))
    print("number of supervision edges in train:", len(edge_pos_train))
    print("number of supervision edges in test:", len(edge_pos_test))

    print("number of subgraph nodes :", num_subgraph_nodes)

    return(data_train, data_test, A, A_train, A_test, edge_pos_train,
           edge_neg_train, edge_pos_test, edge_neg_test, num_subgraph_nodes,
           sub_feature_train_matrix, sub_adj_train_matrix,
           sub_feature_test_matrix, sub_adj_test_matrix, num_subgraph_nodes)
            
            
            
            

In [3]:
def _split_tail(edges, tail):
    """Split ``edges`` into (all but the last ``tail`` rows, the last ``tail`` rows)."""
    # An explicit cut index keeps tail == 0 correct; edges[:-0] would be
    # empty and edges[-0:] would be everything.
    cut = max(len(edges) - tail, 0)
    return edges[:cut], edges[cut:]


def _dense_to_csr_adjacency(edges, nodes_size, undirected):
    """Build a uint8 adjacency matrix with self-loops from an (E, 2) edge array."""
    adj = np.zeros([nodes_size, nodes_size], dtype=np.uint8)
    adj[edges[:, 0], edges[:, 1]] = 1
    if undirected:
        adj[edges[:, 1], edges[:, 0]] = 1
    np.fill_diagonal(adj, 1)
    return ssp.csr_matrix(adj)


def data_loader():
    """Load an edge-list dataset and split it for link prediction.

    Reads ``dataset/<args.dataset_name>.txt`` (first two columns are the edge
    endpoints), shifts 1-based ids to 0-based, shuffles the edges and uses the
    first 54% as positive supervision edges and the rest as message-passing
    edges. An equal number of negative supervision edges is drawn from the
    graph's non-edges. The last ``args.test_ratio`` fraction of the
    supervision edges is held out for testing; train positives are added to
    the test message edges.

    Reads the module-level ``args`` (dataset, dataset_name, network_type,
    feature_type, test_ratio). Uses the global NumPy RNG for shuffling.

    Returns:
        ``(A, A_train, A_test, data_x, nodes_size, edge_size,
        train_message_edges, test_message_edges, train_pos, train_neg,
        test_pos, test_neg)`` where the three adjacency matrices are CSR with
        self-loops, ``data_x`` is the node feature tensor and the edge sets
        are tensors converted from the NumPy arrays.

    Raises:
        ValueError: if ``args.dataset`` is not ``'other'`` or
            ``args.feature_type`` is neither ``'node2vec'`` nor ``'onehot'``.
    """
    network_type = args.network_type
    feature_type = args.feature_type

    print("load data...")
    if args.dataset != 'other':
        # Previously this fell through to the return statement and failed
        # there on unassigned locals.
        raise ValueError("data_loader only supports args.dataset == 'other', got %r" % (args.dataset,))

    file_path = "dataset/" + args.dataset_name + ".txt"

    #sample positive
    positive_all = np.loadtxt(file_path, dtype=int, usecols=(0, 1))
    if np.min(positive_all) == 1:
        positive_all -= 1
    np.random.shuffle(positive_all)
    n = int(len(positive_all)*0.54)
    supervision_edges_pos = np.asarray(positive_all[:n])
    message_edges = np.asarray(positive_all[n:])

    G = nx.Graph() if network_type == 0 else nx.DiGraph()
    G.add_edges_from(positive_all)
    print(G)
    # NOTE(review): the matrices below are sized by the node count, which
    # assumes node ids are contiguous in [0, nodes_size) - confirm for new data.
    nodes_size = len(G.nodes()) #nodes size in the network
    edge_size = len(G.edges())

    # sample negative
    # The non-edges come from G, whose ids are already 0-based, so no id
    # shift is applied here (the former `negative_all -= 1` on a list could
    # only raise TypeError).
    negative_all = list(nx.non_edges(G))
    np.random.shuffle(negative_all)
    supervision_edges_neg = np.asarray(negative_all[:len(supervision_edges_pos)])

    test_size = int(len(supervision_edges_pos) * args.test_ratio)
    train_pos, test_pos = _split_tail(supervision_edges_pos, test_size)
    train_neg, test_neg = _split_tail(supervision_edges_neg, test_size)

    train_message_edges = message_edges
    test_message_edges = np.concatenate([message_edges, train_pos])

    #adj matrix
    undirected = network_type == 0
    A = _dense_to_csr_adjacency(positive_all, nodes_size, undirected)
    A_train = _dense_to_csr_adjacency(train_message_edges, nodes_size, undirected)
    A_test = _dense_to_csr_adjacency(test_message_edges, nodes_size, undirected)

    #nodes feature
    if feature_type == "node2vec":
        data_x = torch.tensor(node2vec(A))
    elif feature_type == "onehot":
        data_x = torch.diag(torch.ones(nodes_size))#ONE_HOT
    else:
        raise ValueError("unsupported args.feature_type: %r" % (feature_type,))

    return(A ,A_train, A_test, data_x, nodes_size, edge_size,
               torch.from_numpy(train_message_edges), torch.from_numpy(test_message_edges),
               torch.from_numpy(train_pos), torch.from_numpy(train_neg),
              torch.from_numpy(test_pos), torch.from_numpy(test_neg))



In [4]:
def _load_node_embeddings(A, path):
    """Return per-node feature vectors indexed by node id.

    For ``'node2vec'`` the vectors are parsed from ``<dataset_name>_node2vec.csv``
    into a list of tensors; for ``'onehot'`` an identity matrix with one row
    per node of ``A`` is returned.
    """
    if args.feature_type == 'node2vec':
        df = pd.read_csv(path+'/%s_node2vec.csv'%(args.dataset_name))
        return [torch.tensor(ast.literal_eval(row)) for row in df['node2vec'].dropna().values]
    if args.feature_type == 'onehot':
        # The adjacency matrix has one row per node (previously an undefined
        # `nodes_size` was referenced here).
        return torch.diag(torch.ones(A.shape[0]))
    raise ValueError("unsupported args.feature_type: %r" % (args.feature_type,))


def _closest_nodes_by_embedding(embed, u, v, k):
    """Pick the ``k`` nodes whose embedding is closest to the midpoint of ``u`` and ``v``.

    ``u`` and ``v`` themselves are given distance 0. Returns the selected node
    ids (plain ints) and a tensor stacking their embeddings in the same order.
    """
    center = (embed[u] + embed[v]) / 2
    cos = torch.nn.CosineSimilarity(dim=0) if args.dist_type == 'cos' else None

    records = []
    for i in range(len(embed)):
        d = 0
        # Both endpoints must keep distance 0; the test half used `or` here,
        # which made the condition true for the endpoints as well.
        if i != u and i != v:
            if args.dist_type == 'norm':
                d = (torch.norm((embed[i])-(center))).item()
            elif args.dist_type == 'cos':
                d = (1-(cos(embed[i], center).item()))
        records.append([i, embed[i], d])

    ranked = pd.DataFrame(records, columns=['node', 'embed', 'distance'])
    ranked = ranked.sort_values('distance', ascending=True)

    # .tolist() yields Python ints, so the list stringifies as "[0, 1, ...]"
    # regardless of how NumPy scalars are repr'd.
    nodes = ((ranked['node'].dropna().values)[:k]).tolist()
    vectors = torch.tensor(np.array([t.numpy() for t in (ranked['embed'].dropna().values)[:k]]))
    return nodes, vectors


def _subgraph_adjacency(A, nodes):
    """Adjacency of ``A`` restricted to ``nodes`` with the (0, 1) target link removed."""
    SA = (A[nodes, :][:, nodes]).toarray()
    SA[0][1]=0.0
    SA[1][0]=0.0
    return SA.tolist()


def _write_split_features(A, embed, num_subgraph_nodes, path, split):
    """Create the subgraph feature CSV for one split (``'train'`` or ``'test'``).

    Reads ``subgraphs_<split>_info.csv`` and writes one row per candidate link
    with columns node1, node2, sub_nodes, sub_A, sub_node2vec_matrix.
    """
    info = pd.read_csv(path + '/subgraphs_%s_info.csv' % split)
    nodes1 = info['nodes1'].dropna().values
    nodes2 = info['nodes2'].dropna().values

    if (args.subgraph_type == 'hhop' or args.subgraph_type == 'hhop-DIS'):
        out_path = path + '/subgraphs_%s_features.csv' % split
        # Node lists were precomputed and stored as stringified lists.
        subgraphs_nodes = [ast.literal_eval(s) for s in info['subgraph_nodes'].dropna().values]
        subgraphs_n2v = [[embed[j].tolist() for j in nodes] for nodes in subgraphs_nodes]

    elif args.subgraph_type == 'DIS':
        out_path = path + '/subgraphs_%s_features_dist.csv' % split
        subgraphs_nodes = []
        subgraphs_n2v = []
        for u, v in zip(nodes1.tolist(), nodes2.tolist()):
            nodes, vectors = _closest_nodes_by_embedding(embed, u, v, num_subgraph_nodes+2)
            subgraphs_nodes.append(nodes)
            subgraphs_n2v.append(vectors.tolist())

    else:
        raise ValueError("unsupported args.subgraph_type: %r" % (args.subgraph_type,))

    #save to csv
    field_names = ['node1', 'node2', 'sub_nodes' , 'sub_A', 'sub_node2vec_matrix']
    # Single open for the whole file; newline='' as the csv module documents.
    with open(out_path, mode='w', newline='') as f_object:
        writer = DictWriter(f_object, fieldnames=field_names)
        writer.writeheader()
        for i in range(len(nodes1)):
            writer.writerow({'node1':nodes1[i], 'node2':nodes2[i],
                             'sub_nodes':subgraphs_nodes[i],
                             'sub_A':_subgraph_adjacency(A, subgraphs_nodes[i]),
                             'sub_node2vec_matrix':subgraphs_n2v[i]})


def subgraph_features(A, num_subgraph_nodes):
    """Write the train and test subgraph feature CSVs for the current dataset.

    For every candidate link listed in ``subgraphs_train_info.csv`` /
    ``subgraphs_test_info.csv`` a subgraph is selected according to
    ``args.subgraph_type``:

    * ``'hhop'`` / ``'hhop-DIS'``: the node list stored in the info file;
    * ``'DIS'``: the ``num_subgraph_nodes + 2`` nodes nearest (by
      ``args.dist_type``: ``'norm'`` or ``'cos'``) to the midpoint of the two
      endpoint embeddings.

    The subgraph's node ids, its adjacency matrix (target link zeroed) and
    the node feature rows are written to ``subgraphs_<split>_features.csv``
    (or ``..._features_dist.csv`` for ``'DIS'``).

    Each split is processed independently, so test rows are built from the
    test subgraphs only.

    Args:
        A: sparse adjacency matrix of the full graph.
        num_subgraph_nodes: base subgraph size (used by ``'DIS'``).

    Raises:
        ValueError: for an unsupported ``args.feature_type`` or
            ``args.subgraph_type``.
    """
    path = "data/%s/%s" %(args.dataset, args.dataset_name)

    embed = _load_node_embeddings(A, path)

    for split in ('train', 'test'):
        _write_split_features(A, embed, num_subgraph_nodes, path, split)

In [5]:

def _load_padded_subgraph_tensors(csv_path, num_nodes, truncate):
    """Load a subgraph feature CSV into zero-padded batch tensors.

    Args:
        csv_path: CSV with columns sub_nodes, sub_node2vec_matrix, sub_A.
        num_nodes: number of node slots per subgraph in the output.
        truncate: when True, subgraphs larger than ``num_nodes`` are cut down
            to their first ``num_nodes`` nodes; when False an oversized
            subgraph raises on assignment, as before.

    Returns:
        ``(embed_matrix, adj_matrix)`` of shapes
        ``(n, num_nodes, args.n2v_dim)`` and ``(n, num_nodes, num_nodes)``.
    """
    df2 = pd.read_csv(csv_path)
    n_subgraphs = len(df2['sub_nodes'].dropna().values)

    embed_matrix = torch.zeros([n_subgraphs, num_nodes, args.n2v_dim])
    for index, cell in enumerate(df2['sub_node2vec_matrix'].dropna().values):
        e = torch.tensor(ast.literal_eval(cell))  # parsed once per row
        if truncate:
            e = e[:num_nodes]
        # Writing into the leading rows leaves the remaining rows as zero padding.
        embed_matrix[index, :len(e)] = e

    adj_matrix = torch.zeros([n_subgraphs, num_nodes, num_nodes])
    for index, cell in enumerate(df2['sub_A'].dropna().values):
        matrix = torch.tensor(ast.literal_eval(cell))
        if truncate:
            matrix = matrix[:num_nodes, :num_nodes]
        size = len(matrix)
        # Top-left block assignment pads with zeros on the right and bottom.
        adj_matrix[index, :size, :size] = matrix

    return embed_matrix, adj_matrix


def subgraph_matrixs(data , A, num_nodes, type_sub):
    """Load the cached subgraph features of one split as padded tensors.

    Args:
        data: unused; kept for existing callers.
        A: unused; kept for existing callers.
        num_nodes: node slots per subgraph. For ``'hhop-DIS'`` this is
            re-derived as ``b + ceil(0.5*b) + 2`` with ``b = num_nodes - 2``.
        type_sub: split name used in the CSV file name (``'train'``/``'test'``).

    Returns:
        ``(embed_matrix, adj_matrix)``: float tensors of shapes
        ``(n_subgraphs, num_nodes, args.n2v_dim)`` and
        ``(n_subgraphs, num_nodes, num_nodes)``.

    Raises:
        ValueError: if ``args.feature_type`` is not ``'node2vec'`` or
            ``args.subgraph_type`` is not one of ``'DIS'``, ``'hhop-DIS'``,
            ``'hhop'``. (These cases previously failed at the return
            statement on an unassigned local.)
    """
    path = "data/%s/%s" %(args.dataset, args.dataset_name)

    if args.feature_type != 'node2vec':
        raise ValueError("subgraph_matrixs only supports args.feature_type == 'node2vec', got %r"
                         % (args.feature_type,))

    if args.subgraph_type == 'DIS':
        csv_path = path+ "/subgraphs_%s_features_dist.csv" %(type_sub)
        truncate = False

    elif args.subgraph_type == 'hhop-DIS':
        num_nodes = num_nodes-2
        num_nodes = num_nodes + math.ceil(0.5*num_nodes) + 2
        csv_path = path+ "/subgraphs_%s_features.csv" %(type_sub)
        truncate = False

    elif args.subgraph_type == 'hhop':
        csv_path = path+ "/subgraphs_%s_features.csv" %(type_sub)
        # Only this mode cuts oversized subgraphs down to num_nodes.
        truncate = True

    else:
        raise ValueError("unsupported args.subgraph_type: %r" % (args.subgraph_type,))

    embed_matrix, adj_matrix = _load_padded_subgraph_tensors(csv_path, num_nodes, truncate)

    return(embed_matrix, adj_matrix)
        

In [6]:

class Logger(object):
    """Collects per-epoch (train, valid, test) scores for a fixed number of runs."""

    def __init__(self, runs, info=None):
        self.info = info
        # One list of result triples per run.
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one (train, valid, test) triple to run number ``run``."""
        assert len(result) == 3
        assert 0 <= run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print a summary for one run, or mean ± std over all runs if ``run`` is None.

        Scores are scaled by 100. "Final" values are taken at the epoch with
        the best validation score.
        """
        if run is not None:
            scores = 100 * torch.tensor(self.results[run])
            best_epoch = scores[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {scores[:, 0].max():.2f}')
            print(f'Highest Valid: {scores[:, 1].max():.2f}')
            print(f'  Final Train: {scores[best_epoch, 0]:.2f}')
            print(f'   Final Test: {scores[best_epoch, 2]:.2f}')
            return

        all_scores = 100 * torch.tensor(self.results)

        per_run = []
        for run_scores in all_scores:
            best_epoch = run_scores[:, 1].argmax()
            per_run.append((
                run_scores[:, 0].max().item(),
                run_scores[:, 1].max().item(),
                run_scores[best_epoch, 0].item(),
                run_scores[best_epoch, 2].item(),
            ))

        summary = torch.tensor(per_run)

        print('All runs:')
        labels = ('Highest Train', 'Highest Valid', '  Final Train', '   Final Test')
        for column, label in enumerate(labels):
            values = summary[:, column]
            print(f'{label}: {values.mean():.2f} ± {values.std():.2f}')

In [7]:
def shortest_path_len(G,i,j):
    """Shortest-path length between ``i`` and ``j`` in ``G``.

    When no path exists, ``len(G) + 1`` is returned instead of raising.
    """
    try:
        return nx.shortest_path_length(G, i, j)
    except nx.NetworkXNoPath:
        return len(G) + 1

In [8]:
def get_link_labels(pos_edge_index, neg_edge_index):
    """Return a float label vector: 1 for each positive edge, then 0 for each negative.

    The edge counts are taken from dimension 1 of the two index tensors.
    """
    n_pos = pos_edge_index.size(1)
    n_neg = neg_edge_index.size(1)
    positives = torch.ones(n_pos, dtype=torch.float)
    negatives = torch.zeros(n_neg, dtype=torch.float)
    return torch.cat([positives, negatives])

In [9]:
def accuracy(pred, label):
    """Fraction of predictions that match ``label`` after thresholding at 0.5.

    The result is rounded to 4 decimals and returned as a Python float.
    """
    binary_pred = pred.ge(0.5).to(pred.device)
    n_correct = (binary_pred == label).sum()
    return round((n_correct / label.shape[0]).item(), 4)

In [10]:
def recall(pred, label):
    """Recall of the predictions thresholded at 0.5, rounded to 4 decimals."""
    binary_pred = pred.ge(0.5).to(pred.device)
    return round(recall_score(label, binary_pred), 4)

In [11]:
def precision(pred, label):
    """Precision of the predictions thresholded at 0.5, rounded to 4 decimals."""
    binary_pred = pred.ge(0.5).to(pred.device)
    return round(precision_score(label, binary_pred), 4)

In [12]:
def f1(pred, label):
    """F1 score of the predictions thresholded at 0.5, rounded to 4 decimals."""
    binary_pred = pred.ge(0.5).to(pred.device)
    return round(f1_score(label, binary_pred), 4)

In [13]:
def precision_recall_c(pred, label):
    """Plot the precision-recall curve of ``pred`` against ``label`` and show it."""
    # NOTE(review): `pyplot` is not among the imports in the notebook's first
    # cell; presumably it is imported elsewhere - confirm.
    precision_values, recall_values, _thresholds = precision_recall_curve(label, pred)

    pyplot.plot(recall_values, precision_values, marker='.', label='Logistic')
    pyplot.xlabel('Recall')
    pyplot.ylabel('Precision')
    pyplot.legend()
    pyplot.show()


In [14]:
def precision_recall(pred, label):
    """Return the ``(precision, recall)`` arrays of the precision-recall curve."""
    curve = precision_recall_curve(label, pred)
    # The third element (thresholds) is not needed by callers.
    return (curve[0], curve[1])


In [15]:
def precision_recall_AUC(pred, label):
    """Area under the precision-recall curve, rounded to 4 decimals."""
    precision_values, recall_values, _thresholds = precision_recall_curve(label, pred)
    return round(metrics.auc(recall_values, precision_values), 4)

In [16]:
def auc(pred, label):
    """Area under the ROC curve computed from ``roc_curve`` (not rounded)."""
    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(label, pred)
    roc_area = metrics.auc(false_pos_rate, true_pos_rate)
    return roc_area

In [17]:
def roc(pred, label):
    """ROC AUC from ``roc_auc_score``, rounded to 4 decimals."""
    return round(roc_auc_score(label, pred), 4)

In [18]:
def average_precision(pred, label):
    """Average precision from ``average_precision_score``, rounded to 4 decimals."""
    return round(average_precision_score(label, pred), 4)

In [19]:
def generate_node2vec_embeddings(A, emd_size=128, negative_injection=False, train_neg=None):
    """Train node2vec embeddings on the graph described by adjacency ``A``.

    Random walks (10 per node, length 80, p = q = 1) are generated with the
    ``n2v`` module and fed to a skip-gram Word2Vec model (window 10, one
    epoch). Nodes that did not end up in the Word2Vec vocabulary receive the
    mean of all learned embeddings.

    Args:
        A: sparse adjacency matrix; the graph is treated as undirected.
        emd_size: embedding dimension.
        negative_injection: when True, the ``train_neg`` pairs are added as
            edges to a copy of ``A`` before walking.
        train_neg: ``(row, col)`` index pair; only read when
            ``negative_injection`` is True.

    Returns:
        float32 array of shape ``(A.shape[0], emd_size)``.
    """
    if negative_injection:
        row, col = train_neg
        A = A.copy()
        A[row, col] = 1  # inject negative train
        A[col, row] = 1  # inject negative train
    nx_G = nx.from_scipy_sparse_array(A)
    G = n2v.Graph(nx_G, is_directed=False, p=1, q=1)
    G.preprocess_transition_probs()
    walks = G.simulate_walks(num_walks=10, walk_length=80)
    # Word2Vec expects string tokens.
    walks = [list(map(str, walk)) for walk in walks]
    model = Word2Vec(walks, vector_size=emd_size, window=10, min_count=0, sg=1,
            workers=8, epochs=1)
    wv = model.wv
    embeddings = np.zeros([A.shape[0], emd_size], dtype='float32')
    sum_embeddings = 0
    empty_list = []
    for i in range(A.shape[0]):
        key = str(i)
        if key in wv:
            # Index lookup replaces the deprecated KeyedVectors.word_vec().
            embeddings[i] = wv[key]
            sum_embeddings += embeddings[i]
        else:
            empty_list.append(i)
    known = A.shape[0] - len(empty_list)
    # Only fill in a mean when there is something to average and something
    # to fill; with no known node the rows stay zero instead of dividing by 0.
    if empty_list and known > 0:
        embeddings[empty_list] = sum_embeddings / known
    return embeddings

In [20]:
def node2vec(A):
    """Compute node2vec embeddings for ``A`` and cache them as a CSV.

    The embeddings (dimension ``args.n2v_dim``) are written to
    ``data/<args.dataset>/<args.dataset_name>/<args.dataset_name>_node2vec.csv``
    with a single ``node2vec`` column holding one stringified vector per node.

    Returns:
        The embedding array as produced by ``generate_node2vec_embeddings``.
    """
    embed = generate_node2vec_embeddings(A, args.n2v_dim)
    data_x = torch.tensor(np.array(embed))

    path = "data/%s/%s" %(args.dataset, args.dataset_name)

    # Write header and all rows through one file handle instead of reopening
    # the file for every node; newline='' as the csv module documents.
    with open (path+'/%s_node2vec.csv'%(args.dataset_name), mode='w', newline='') as f_object:
        writer = DictWriter(f_object, fieldnames=['node2vec'])
        writer.writeheader()
        writer.writerows({'node2vec': row} for row in data_x.tolist())
    return(embed)

In [21]:

def neighbors(fringe, A, outgoing=True):
    """Return the set of stored indices in the rows (or columns) of ``A`` selected by ``fringe``.

    With ``outgoing=True`` the rows ``A[fringe]`` are sliced; otherwise the
    columns ``A[:, fringe]``. The result is the set of the slice's
    ``.indices``.
    """
    selected = list(fringe)
    block = A[selected] if outgoing else A[:, selected]
    return set(block.indices)

In [22]:
def k_hop_subgraph(num_hops, A, src, dst=None, num_nodes=0 ,sample_ratio=1.0, 
                   max_nodes_per_hop=None, node_features=None, 
                   y=1, directed=False, A_csc=None):
    """Extract the k-hop enclosing subgraph around ``src`` (and ``dst``) from ``A``.

    Starting from the seed node(s), the frontier is expanded hop by hop with
    ``neighbors``; newly reached nodes are appended to the node list, so the
    seeds always occupy the first position(s) of the result.

    Args:
        num_hops: Maximum number of expansion rounds.
        A: Scipy sparse adjacency matrix (row-sliceable).
        src: Seed node; must expose ``.item()`` (e.g. a 0-dim tensor).
        dst: Optional second seed node, same type as ``src``.
        num_nodes: When non-zero, upper bound on the number of returned nodes;
            the expansion stops as soon as the bound would be exceeded.
        sample_ratio: When below 1.0, fraction of each hop's frontier to keep
            (chosen with ``random.sample``).
        max_nodes_per_hop: Optional cap on the frontier size of each hop.
        node_features: Accepted for interface compatibility; it does not
            influence the returned values.
        y: Accepted for interface compatibility; unused.
        directed: When true, incoming neighbours are collected from ``A_csc``
            in addition to the outgoing ones from ``A``.
        A_csc: Column-sliceable version of ``A``; required when ``directed``.

    Returns:
        ``(nodes, subgraph)`` where ``nodes`` is the ordered list of node ids
        and ``subgraph`` is ``A`` restricted to those nodes, with the entries
        ``[0, 1]`` and ``[1, 0]`` zeroed.
    """
    # ``is None`` instead of ``== None``: ``dst`` is normally a tensor.
    seeds = [src.item()] if dst is None else [src.item(), dst.item()]
    nodes = list(seeds)
    visited = set(seeds)
    fringe = set(seeds)

    for _ in range(num_hops):
        if not directed:
            fringe = neighbors(fringe, A)
        else:
            out_neighbors = neighbors(fringe, A)
            in_neighbors = neighbors(fringe, A_csc, False)
            fringe = out_neighbors.union(in_neighbors)
        fringe = fringe - visited
        # Nodes dropped by the sampling below still count as visited.
        visited = visited.union(fringe)

        # random.sample() needs a sequence: passing a set raises TypeError on
        # Python 3.11+ (and was deprecated since 3.9).
        if sample_ratio < 1.0:
            fringe = random.sample(list(fringe), int(sample_ratio*len(fringe)))
        if max_nodes_per_hop is not None and max_nodes_per_hop < len(fringe):
            fringe = random.sample(list(fringe), max_nodes_per_hop)
        if len(fringe) == 0:
            break

        new_nodes = list(fringe)
        if num_nodes != 0 and len(nodes) + len(new_nodes) > num_nodes:
            # Keep only as many frontier nodes as still fit under the bound.
            nodes = nodes + new_nodes[:max(num_nodes - len(nodes), 0)]
            break
        nodes = nodes + new_nodes

    subgraph = A[nodes, :][:, nodes]

    # Remove target link between the subgraph. The guard avoids an IndexError
    # when the subgraph consists of a single (isolated) seed node.
    # NOTE(review): with ``dst=None`` position 1 is the first discovered
    # neighbour rather than a target node; the entry is still cleared to keep
    # the previous behaviour - confirm whether that is intended.
    if subgraph.shape[0] > 1:
        subgraph[0, 1] = 0
        subgraph[1, 0] = 0

    return nodes, subgraph


In [23]:
def subgraph_primitive(A, nodes1, nodes2, num_nodes, type_data ):
    """Extract one enclosing subgraph per link and append them to a CSV file.

    For every pair ``(nodes1[i], nodes2[i])`` the subgraph returned by
    ``k_hop_subgraph`` (with ``args.max_hop`` hops) is written as one row of
    ``data/<args.dataset>/<args.dataset_name>/subgraphs_<type_data>_info.csv``.
    The header is written only when the file does not exist yet; otherwise
    rows are appended to the existing file.

    Args:
        A: Adjacency matrix the subgraphs are cut from.
        nodes1: Sequence of first endpoints (each exposing ``.item()``).
        nodes2: Sequence of second endpoints, aligned with ``nodes1``.
        num_nodes: Node budget forwarded to ``k_hop_subgraph``.
        type_data: Tag used in the output file name.
    """
    max_hop = args.max_hop
    path = "data/%s/%s" %(args.dataset, args.dataset_name)
    csv_path = path+'/subgraphs_%s_info.csv'%(type_data)
    field_names = ['nodes1', 'nodes2', 'subgraph_nodes', 'subgraph_A']

    # With no links there is nothing to write and the file must not be created.
    if len(nodes1) == 0:
        return

    # Decide once whether we are creating or extending the file, then keep it
    # open for all rows instead of re-opening it for every link.
    already_exists = os.path.exists(csv_path)
    with open(csv_path, mode='a' if already_exists else 'w') as f_object:
        dictwriter_object = DictWriter(f_object, fieldnames=field_names)
        if not already_exists:
            dictwriter_object.writeheader()

        for index in range(len(nodes1)):
            subgraph_nodes, subgraph_A = k_hop_subgraph(max_hop, A, nodes1[index], nodes2[index], num_nodes=num_nodes)
            dictwriter_object.writerow({'nodes1':subgraph_nodes[0], 'nodes2':subgraph_nodes[1],
                                        'subgraph_nodes':subgraph_nodes, 'subgraph_A':subgraph_A})
          



In [24]:
def subgraph2vec(A, embed, num_nodes):
    """Turn per-subgraph node embeddings into one subgraph-level feature per sample.

    For every sample ``embed[index]`` the rows 0 and 1 are treated as the
    target pair: their mean is the reference point, every other row gets a
    distance to it (``args.dist_type``: ``'norm'`` or ``'cos'``), the rows are
    sorted by that distance and the closest ``num_nodes + 2`` are kept.

    Depending on ``args.subgraph_feature_type``:

    * ``'NDP'``: the kept rows are combined into a single weighted sum and the
      sums of all samples are concatenated along dimension 0.
    * ``'CNN'``: the kept rows are stacked, zero-padded to ``num_nodes + 2``
      rows and stored in a ``(len(embed), num_nodes + 2, len(embed[0][0]))``
      tensor.

    Args:
        A: Not used by this function.
        embed: Batch of per-subgraph node embeddings, indexed as
            ``embed[sample][node]``.
        num_nodes: Number of non-target nodes to keep per subgraph.

    Returns:
        The tensor described above; for any other ``subgraph_feature_type``
        the initial empty tensor is returned unchanged.
    """
    
    z_embed = torch.tensor(()).to(embed.device)
    
    # Output buffer for the 'CNN' mode (created with torch.zeros defaults).
    matrix_embed = torch.zeros([len(embed),num_nodes+2,len(embed[0][0])])

    if args.dist_type == 'cos':
        cos = torch.nn.CosineSimilarity(dim=0)

    for index in range(len(embed)):
        dis = []  # never read below


        # Reference point: midpoint of the two target-node embeddings.
        embed_node1 = (embed[index][0])
        embed_node2 = (embed[index][1])
        embed_node_index = ((embed_node1+embed_node2)/2)



        d_list = []
        dataf = []
        for i in range (len(embed[index])):

            if i != 0 and i != 1:

                # NOTE(review): for any other dist_type nothing is appended here,
                # so d_list ends up shorter than embed[index].
                if args.dist_type == 'norm':
                    d_list.append((torch.norm((embed[index][i])-(embed_node_index))).item())

                elif args.dist_type == 'cos':
                    d_list.append(1-(cos(embed[index][i], embed_node_index).item()))
            else:
                # The two target nodes get distance 0 so they sort to the front.
                d_list.append(0)

        # Rescale the distance vector with sklearn's normalize (default settings).
        d_list = torch.tensor((normalize([d_list])[0]).tolist())


        for i in range (len(embed[index])):
            dataf.append([embed[index][i], d_list[i]])



        # Sort the (embedding, distance) pairs by increasing distance.
        df = pd.DataFrame(dataf, columns=['embed', 'distance'])
        df = df.sort_values('distance', ascending=True)


        # Keep at most num_nodes+2 closest rows.
        subgraph_embed = df['embed'].dropna().values[:(num_nodes+2)]
        distances = df['distance'].dropna().values[:(num_nodes+2)]


        if args.subgraph_feature_type == 'NDP':

            w_list = torch.zeros(len(distances))

            for i in range(len(distances)):
                d = distances[i].item()
                # d is clamped to [0.0001, 0.9999] so the logarithm stays finite.
                if d<=0:
                    w_list[i] = math.log((1-0.0001)/0.0001)
                elif(d>=1):
                    w_list[i] = math.log((1-0.9999)/0.9999)
                else:
                    w_list[i] = math.log((1-d)/d) # AdaBoost-style log-odds weight
            # Weights are rescaled to sum to 1 (no guard for a zero sum).
            w_list = w_list / (w_list.sum())

            for i in range(len(distances)):
                subgraph_embed[i] = subgraph_embed[i]*w_list[i]

            # Sum over the object array, i.e. the weighted sum of the kept embeddings.
            s = (subgraph_embed.sum())

            s = s.reshape([1,len(s)])
            z_embed = torch.cat((z_embed,s), 0)



        elif args.subgraph_feature_type == 'CNN':
            # detach()/numpy() copies values only: no gradient flows back through this tensor.
            matrix = torch.tensor(np.array([t.detach().numpy() for t in subgraph_embed]))

            # Zero-pad subgraphs that have fewer than num_nodes+2 nodes.
            if len(matrix)< num_nodes+2:
                x = torch.zeros([(num_nodes+2-len(matrix)), len(matrix[0])])
                matrix = torch.cat([matrix,x])


            matrix_embed[index] = matrix
            z_embed = matrix_embed
                
    
    return(z_embed)
        

In [25]:
def adj_dense(subgraph_A ):
    """Convert a scipy sparse matrix into a dense ``float32`` torch tensor.

    Args:
        subgraph_A: Scipy sparse matrix (anything providing ``tocoo()``).

    Returns:
        Dense tensor holding the matrix values, with size-1 dimensions removed
        by ``squeeze()`` (so a ``1 x n`` matrix comes back as a 1-D tensor).
    """
    coo = subgraph_A.tocoo()
    indices = torch.from_numpy(np.vstack((coo.row, coo.col))).long()
    values = torch.tensor(coo.data, dtype=torch.float32)

    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor
    # constructor; duplicate coordinates are summed by to_dense() as before.
    dense = torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape)).to_dense()
    return(dense.squeeze())

In [26]:
def adj_to_edge_index(adj):
    """Return the coordinates of the non-zero entries of ``adj`` as a contiguous ``(2, E)`` tensor."""
    nonzero_coords = adj.nonzero()        # one (row, col) pair per non-zero entry
    transposed = nonzero_coords.t()       # -> first row: row ids, second row: col ids
    return transposed.contiguous()

In [27]:
class GCN(torch.nn.Module):
    """Stack of ``GCNConv`` layers applied independently to every subgraph of a batch."""

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers, dropout):
        """Build ``in -> hidden -> ... -> out`` with ``num_layers - 2`` hidden-to-hidden layers."""
        super().__init__()

        layer_dims = [(in_channels, hidden_channels)]
        layer_dims += [(hidden_channels, hidden_channels)] * (num_layers - 2)
        layer_dims.append((hidden_channels, out_channels))
        self.convs = torch.nn.ModuleList([GCNConv(fan_in, fan_out) for fan_in, fan_out in layer_dims])

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every convolution layer."""
        for layer in self.convs:
            layer.reset_parameters()

    def forward(self, embed, adj):
        """Encode each subgraph ``i`` from its features ``embed[i]`` and dense adjacency ``adj[i]``.

        Returns the per-subgraph outputs concatenated along a new leading
        dimension.
        """
        # The leading empty tensor keeps the result well-defined for an empty batch.
        pieces = [torch.tensor([]).to(adj.device)]
        for i, dense_adj in enumerate(adj):
            edge_index = dense_adj.nonzero().t().contiguous()
            h = embed[i]
            for layer in self.convs[:-1]:
                h = F.relu(layer(h, edge_index))
                h = F.dropout(h, p=self.dropout, training=self.training)
            h = self.convs[-1](h, edge_index)
            pieces.append(h.reshape([1, len(h), len(h[0])]))
        return torch.cat(pieces, 0)
        

In [28]:

class SAGE(torch.nn.Module):
    """Stack of ``SAGEConv`` layers with ReLU and dropout between them."""

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        """Build ``in -> hidden -> ... -> out`` with ``num_layers - 2`` hidden-to-hidden layers."""
        super().__init__()

        layer_dims = [(in_channels, hidden_channels)]
        layer_dims += [(hidden_channels, hidden_channels)] * (num_layers - 2)
        layer_dims.append((hidden_channels, out_channels))
        self.convs = torch.nn.ModuleList([SAGEConv(fan_in, fan_out) for fan_in, fan_out in layer_dims])

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every convolution layer."""
        for layer in self.convs:
            layer.reset_parameters()

    def forward(self, x, adj_t):
        """Apply all layers to ``x``; the last layer's output is returned without activation."""
        *inner_layers, output_layer = self.convs
        for layer in inner_layers:
            x = F.relu(layer(x, adj_t))
            x = F.dropout(x, p=self.dropout, training=self.training)
        return output_layer(x, adj_t)



In [29]:

class GAutoEncoder(torch.nn.Module):
    """Encoder made of ``GCNConv`` layers with ReLU and dropout between them."""

    def __init__(self, GNN_in_channels, GNN_hidden_channels, GNN_out_channels,
                 GNN_num_layers, dropout):
        """Build ``in -> hidden -> ... -> out`` with ``GNN_num_layers - 2`` hidden-to-hidden layers."""
        super().__init__()

        layer_dims = [(GNN_in_channels, GNN_hidden_channels)]
        layer_dims += [(GNN_hidden_channels, GNN_hidden_channels)] * (GNN_num_layers - 2)
        layer_dims.append((GNN_hidden_channels, GNN_out_channels))
        self.encoders = torch.nn.ModuleList([GCNConv(fan_in, fan_out) for fan_in, fan_out in layer_dims])

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every encoder layer."""
        for layer in self.encoders:
            layer.reset_parameters()

    def forward(self, x, adj):
        """Apply all encoder layers to ``x``; the last layer's output is returned without activation."""
        *inner_layers, output_layer = self.encoders
        for layer in inner_layers:
            x = F.relu(layer(x, adj))
            x = F.dropout(x, p=self.dropout, training=self.training)
        return output_layer(x, adj)


  
     

In [30]:
def predictor_simple(z, edge_index):
    """Score every edge as the dot product of its two endpoint embeddings.

    Args:
        z: Node embedding matrix, indexed by node id.
        edge_index: Pair of index tensors (``edge_index[0]`` and
            ``edge_index[1]``) naming the two endpoints of every edge.

    Returns:
        One logit per edge.
    """
    first_idx, second_idx = edge_index[0].long(), edge_index[1].long()
    return torch.mul(z[first_idx], z[second_idx]).sum(dim=-1)

In [31]:
class predictor_model(torch.nn.Module):
    """MLP link predictor mapping a concatenated feature vector to one logit per sample.

    The layer widths are ``3*linear_in_channels -> 128 -> hidden -> hidden -> 16 -> 1``.
    """

    def __init__(self, linear_in_channels, linear_hidden_channels, linear_num_layers, dropout):
        """Create the five linear layers.

        Args:
            linear_in_channels: Width of one of the three concatenated inputs;
                the first layer expects ``3 * linear_in_channels`` features.
            linear_hidden_channels: Width of the two middle layers.
            linear_num_layers: Accepted for interface compatibility; the depth
                is fixed at five layers.
            dropout: Dropout probability applied after every hidden layer.
        """
        super(predictor_model, self).__init__()        
        
        self.lins = torch.nn.ModuleList()
        self.lins.append(torch.nn.Linear(3*linear_in_channels, 128))
        
        self.lins.append(torch.nn.Linear(128, linear_hidden_channels))
        self.lins.append(torch.nn.Linear(linear_hidden_channels, linear_hidden_channels))
        self.lins.append(torch.nn.Linear(linear_hidden_channels, 16))
        
        self.lins.append(torch.nn.Linear(16, 1))
        
        self.dropout = dropout
        
    def reset_parameters(self):
        """Re-initialise every linear layer."""
        for lin in self.lins:
            lin.reset_parameters()
            
    def forward(self, z):
        """Return a 1-D ``float32`` tensor with one logit per row of ``z``."""
        for lin in self.lins[:-1]:
            z = lin(z)
            z = F.relu(z)
            z = F.dropout(z, p=self.dropout, training=self.training)
        z = self.lins[-1](z)

        # Take the first output value of every sample in one slice rather than
        # copying the logits element by element in a Python loop.
        return z.flatten(1)[:, 0].to(torch.float)



In [32]:
class subg2vec_model(torch.nn.Module):
    """Small CNN that turns a ``(rows, feture_dim)`` subgraph matrix into a feature vector.

    Architecture: Conv2d(1->8) -> ReLU -> MaxPool(2) -> Conv2d(8->16) -> ReLU
    -> MaxPool(2) -> flatten -> Linear -> ReLU -> Linear.
    """

    def __init__(self, input_num, feture_dim, hidden_channels, out_channels, dropout):
        """Create the layers.

        Args:
            input_num: Number of subgraph nodes excluding the two target
                nodes; the network expects ``input_num + 2`` input rows.
            feture_dim: Width of each input row (also the width of the first
                convolution kernel).
            hidden_channels: Width of the hidden linear layer.
            out_channels: Width of the output vector.
            dropout: Stored on the module; not applied in ``forward``.
        """
        super(subg2vec_model, self).__init__()        
        
        self.conv1 = torch.nn.Conv2d(1, 8, (3,feture_dim), padding=1)
        self.pool = torch.nn.MaxPool2d(2, 2)
        self.conv2 = torch.nn.Conv2d(8, 16, (3,2), padding=1)
        
        input_num = input_num+2

        # Size of the flattened feature map fed to lin1:
        #  * rows:  the 3-row kernels with padding 1 keep the height, each
        #           pooling halves it (floor)      -> input_num // 2 // 2
        #  * cols:  conv1's kernel spans the full width, so with padding 1 it
        #           yields 3 columns -> pool -> 1 -> conv2 (width 2, padding 1)
        #           -> 2 -> pool -> 1, whatever feture_dim is
        #  * 16 output channels of conv2
        # The previous formula derived the column factor from feture_dim and
        # only matched the real size for feture_dim around 128.
        pooled_rows = (input_num // 2) // 2
        pooled_cols = 1
        self.lin1 = torch.nn.Linear(16 * pooled_rows * pooled_cols, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, out_channels)

        self.dropout = dropout
        
    def reset_parameters(self):
        """Re-initialise the convolution and linear layers."""
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()
        self.lin1.reset_parameters()
        self.lin2.reset_parameters()
            
    def forward(self, x):
        """Map a batch ``(B, rows, feture_dim)`` to ``(B, out_channels)``."""
        x = x.unsqueeze(1)  # add the single input channel expected by Conv2d
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x,1) # flatten all dimensions except batch
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return (x.squeeze(1))


In [33]:

def train( model, model_predictor, data, A, data_split,  train_subgraph_adj, train_subgraph_features,
          optimizer, optimizer_predictor, batch_size, num_subgraph_nodes, model_subgraph=None, optimizer_subgraph=None):
    """Run one training epoch over all positive and negative training links.

    Each mini-batch is encoded by ``model``, combined with a subgraph-level
    feature from ``subgraph2vec`` (passed through ``model_subgraph`` when
    ``args.subgraph_feature_type == 'CNN'``) and scored by ``model_predictor``
    with a ``BCEWithLogitsLoss`` objective. For ``args.GNN_type ==
    'autoencoder'`` the model's reconstruction loss is added.

    Args:
        model: Subgraph encoder (called directly, or through ``encode`` for
            the autoencoder type).
        model_predictor: Module mapping the combined features to link logits.
        data: Object whose ``x.device`` selects the device (and whose
            ``edge_index`` feeds the autoencoder reconstruction loss).
        A: Adjacency matrix forwarded to ``subgraph2vec``.
        data_split: Dict with ``['train']['edge']`` and ``['train']['edge_neg']``.
        train_subgraph_adj: Per-link subgraph adjacency tensors.
        train_subgraph_features: Per-link subgraph node features.
        optimizer: Optimizer of ``model``.
        optimizer_predictor: Optimizer of ``model_predictor``.
        batch_size: Number of links per mini-batch.
        num_subgraph_nodes: Node budget forwarded to ``subgraph2vec``.
        model_subgraph: CNN over subgraph matrices; required in 'CNN' mode.
        optimizer_subgraph: Optimizer of ``model_subgraph``; required in 'CNN' mode.

    Returns:
        Mean loss per training example over the epoch.

    Raises:
        ValueError: If ``args.subgraph_feature_type`` is neither 'CNN' nor 'NDP'.
    """
    if args.subgraph_feature_type not in ('CNN', 'NDP'):
        raise ValueError("unsupported subgraph_feature_type: %r" % (args.subgraph_feature_type,))
    use_cnn = args.subgraph_feature_type == 'CNN'
    is_autoencoder = args.GNN_type == 'autoencoder'
    device = data.x.device

    model.train()
    model_predictor.train()
    if use_cnn:
        model_subgraph.train()

    pos_train_edge = data_split['train']['edge'].to(device)
    neg_train_edge = data_split['train']['edge_neg'].to(device)

    train_edge = (torch.cat((pos_train_edge, neg_train_edge), dim=0))
    link_labels = (get_link_labels(pos_train_edge.T, neg_train_edge.T))

    criterion = BCEWithLogitsLoss()
    total_examples = 0
    total_loss = 0

    train_loader = DataLoader(range(len(train_edge)), batch_size, shuffle=True)
    pbar = tqdm(train_loader, ncols=70)
    for perm in pbar:
        # Gradients must be cleared for every batch; clearing them only once
        # before the loop made each step use the sum of all earlier batches.
        optimizer.zero_grad()
        optimizer_predictor.zero_grad()
        if use_cnn:
            optimizer_subgraph.zero_grad()

        sub_adj = train_subgraph_adj[perm].to(device)
        sub_x = train_subgraph_features[perm].to(device)
        labels = link_labels[perm].to(device)

        if is_autoencoder:
            z = model.encode(sub_x, sub_adj)
        else:
            z = model(sub_x, sub_adj)

        # Rows 0 and 1 of every subgraph are the two endpoints of the link.
        z_nodes = torch.cat([z[:, 0], z[:, 1]], dim=-1)

        if use_cnn:
            sub_embed = subgraph2vec(A, z, num_subgraph_nodes ).to(device)
            z_sub = model_subgraph(sub_embed)
        else:
            z_sub = subgraph2vec(A, z, num_subgraph_nodes ).to(device)
        z_embed = torch.cat((z_nodes,z_sub),-1)

        link_logits = model_predictor(z_embed).to(device) # predict with NN

        # The loss is always computed against this batch's labels (the
        # autoencoder branch used the labels of the whole epoch before).
        loss_predict = criterion(link_logits, labels)
        if is_autoencoder:
            loss_autoencoder = model.recon_loss(z, data.edge_index.T)
            loss = loss_autoencoder+loss_predict
        else:
            loss = loss_predict
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
        torch.nn.utils.clip_grad_norm_(model_predictor.parameters(), 2.0)
        if use_cnn:
            torch.nn.utils.clip_grad_norm_(model_subgraph.parameters(), 2.0)

        num_examples = (link_logits.size(0))
        total_loss += loss.item() * num_examples
        total_examples += num_examples

        optimizer.step()
        optimizer_predictor.step()
        if use_cnn:
            optimizer_subgraph.step()

    return (total_loss/ total_examples)



In [34]:
@torch.no_grad()
def test(model, model_predictor,  data_train, A_train, data_test, A_test, split_edge, train_subgraph_adj, train_subgraph_features,
         test_subgraph_adj, test_subgraph_features, batch_size, num_subgraph_nodes,
         model_subgraph=None, optimizer_subgraph=None, evaluator=False):
    """Evaluate the models on the training and test links.

    Logits are predicted batch-wise (no shuffling, no gradients) for all
    positive and negative links of both splits and fed to the metric helpers
    defined elsewhere in this file.

    Args:
        model: Subgraph encoder (called directly, or through ``encode`` when
            ``args.GNN_type == 'autoencoder'``).
        model_predictor: Module mapping the combined features to link logits.
        data_train: Object whose ``x.device`` selects the device.
        A_train: Adjacency matrix forwarded to ``subgraph2vec`` for train links.
        data_test: Accepted for interface compatibility; unused.
        A_test: Adjacency matrix forwarded to ``subgraph2vec`` for test links.
        split_edge: Dict with ``'edge'``/``'edge_neg'`` under ``'train'`` and ``'test'``.
        train_subgraph_adj, train_subgraph_features: Per-link subgraph tensors (train).
        test_subgraph_adj, test_subgraph_features: Per-link subgraph tensors (test).
        batch_size: Number of links per mini-batch.
        num_subgraph_nodes: Node budget forwarded to ``subgraph2vec``.
        model_subgraph: CNN over subgraph matrices; required in 'CNN' mode.
        optimizer_subgraph: Accepted for interface compatibility; unused.
        evaluator: Accepted for interface compatibility; unused.

    Returns:
        ``{'train': {...}, 'test': {...}}`` with accuracy, recall, precision,
        F1, average precision, PR-AUC, ROC-AUC, the precision/recall curves
        (as lists), the logits (``'pred'``) and the labels (``'y'``).

    Raises:
        ValueError: If ``args.subgraph_feature_type`` is neither 'CNN' nor 'NDP'.
    """
    if args.subgraph_feature_type not in ('CNN', 'NDP'):
        raise ValueError("unsupported subgraph_feature_type: %r" % (args.subgraph_feature_type,))
    use_cnn = args.subgraph_feature_type == 'CNN'
    device = data_train.x.device

    model.eval()
    model_predictor.eval()
    if use_cnn:
        model_subgraph.eval()

    def _predict_links(num_links, subgraph_adj, subgraph_features, adjacency):
        # Score ``num_links`` links batch by batch and return all logits as one tensor.
        preds = []
        for perm in DataLoader(range(num_links), batch_size, shuffle=False):
            sub_adj = subgraph_adj[perm].to(device)
            sub_x = subgraph_features[perm].to(device)

            if args.GNN_type == 'autoencoder':
                z = model.encode(sub_x, sub_adj)
            else:
                z = model(sub_x, sub_adj)

            # Rows 0 and 1 of every subgraph are the two endpoints of the link.
            z_nodes = torch.cat([z[:, 0], z[:, 1]], dim=-1)

            if use_cnn:
                sub_embed = subgraph2vec(adjacency, z, num_subgraph_nodes ).to(device)
                z_sub = model_subgraph(sub_embed)
            else:
                z_sub = subgraph2vec(adjacency, z, num_subgraph_nodes ).to(device)
            z_embed = torch.cat((z_nodes,z_sub),-1)

            preds += [model_predictor(z_embed)]
        return torch.cat(preds, dim=-1)

    pos_train_edge = split_edge['train']['edge'].T.to(device)
    neg_train_edge = split_edge['train']['edge_neg'].T.to(device)
    pos_test_edge = split_edge['test']['edge'].T.to(device)
    neg_test_edge = split_edge['test']['edge_neg'].T.to(device)

    train_labels = (get_link_labels(pos_train_edge, neg_train_edge)).to(device)
    train_edge = torch.cat((pos_train_edge, neg_train_edge), dim=1)
    train_pred = _predict_links(train_edge.size(1), train_subgraph_adj,
                                train_subgraph_features, A_train)

    test_labels = (get_link_labels(pos_test_edge, neg_test_edge)).to(device)
    test_edge = torch.cat((pos_test_edge, neg_test_edge), dim=1)
    test_pred = _predict_links(test_edge.size(1), test_subgraph_adj,
                               test_subgraph_features, A_test)

    train_logits = train_pred.to(device)
    test_logits = test_pred.to(device)

    accu_train = accuracy(train_logits, train_labels)
    accu_test = accuracy(test_logits, test_labels)

    recall_train = recall(train_logits, train_labels)
    recall_test = recall(test_logits, test_labels)

    precision_train = precision(train_logits, train_labels)
    precision_test = precision(test_logits, test_labels)

    f1_train = f1(train_logits, train_labels)
    f1_test = f1(test_logits, test_labels)

    avg_precision_train = average_precision(train_logits, train_labels)
    avg_precision_test = average_precision(test_logits, test_labels)

    roc_auc_train = roc(train_logits, train_labels)
    roc_auc_test = roc(test_logits, test_labels)

    pr_auc_train = precision_recall_AUC(train_logits, train_labels)
    pr_auc_test = precision_recall_AUC(test_logits, test_labels)

    precision_list_train, recall_list_train = precision_recall(train_logits, train_labels)
    precision_list_train = precision_list_train.tolist()
    recall_list_train = recall_list_train.tolist()
    precision_list_test, recall_list_test = precision_recall(test_logits, test_labels)
    precision_list_test = precision_list_test.tolist()
    recall_list_test = recall_list_test.tolist()

    results = {'train': {'accu_train': accu_train, 'recall_train':recall_train, 'precision_train':precision_train, 
                        'f1_train':f1_train, 'avg_precision_train':avg_precision_train, 'pr_auc_train':pr_auc_train, 
                        'precision_list_train':precision_list_train, 'recall_list_train':recall_list_train,
                        'pred': train_logits, 'y':train_labels, 'roc_auc_train':roc_auc_train}, 
                'test': {'accu_test': accu_test, 'recall_test':recall_test, 'precision_test':precision_test, 
                        'f1_test':f1_test, 'avg_precision_test':avg_precision_test, 'pr_auc_test':pr_auc_test,
                        'precision_list_test':precision_list_test, 'recall_list_test':recall_list_test, 
                        'pred': test_logits, 'y':test_labels, 'roc_auc_test':roc_auc_test}}

    return results


In [35]:
def parse_arguments(device=0, dataset='other', dataset_name='karate', subgraph_type='hhop', dist_type='norm', 
                    network_type=0, feature_type='onehot', subgraph_feature_type='CNN',
                    negative_injection=True, log_steps=1, GNN_type='gcn', GNN_num_layers=3, GNN_hidden_channels=128,
                    GNN_out_channels=128, linear_num_layers=5, linear_hidden_channels=32, n2v_dim=128,
                    subg2vec_hidden_channels=128, subg2vec_out_channels=128, max_hop=10, dropout=0.0, no_start_run=1,
                    batch_size=50, lr=0.0001 ,epochs=300, eval_steps=300, test_ratio=0.1,runs=10,
                    coefficient=50, label='dist'):
    """Build the experiment configuration as an ``argparse.Namespace``.

    Every keyword argument becomes the default of the command-line option of
    the same name. The parser is evaluated on an empty argument list, so the
    returned namespace contains exactly these defaults (plus
    ``graphlet_size=4``); string defaults are converted by the option's type.

    Returns:
        ``argparse.Namespace`` with one attribute per option.
    """

    def _str2bool(value):
        # ``type=bool`` would turn every non-empty string - including
        # 'False' - into True, so boolean options get an explicit converter.
        if isinstance(value, bool):
            return value
        lowered = str(value).strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError('boolean value expected, got %r' % (value,))

    parser = argparse.ArgumentParser(description='GAE_subgraph')
    parser.add_argument('--device', type=int, default=device)
    parser.add_argument('--dataset', type=str, default=dataset) #pyg, ogb, networkx
    parser.add_argument('--dataset_name', type=str, default=dataset_name) #'ogbl-collab', 'football', 'FB15k-237',
    #'karate', 'USAir', 'PB'
    parser.add_argument('--subgraph_type', type=str, default=subgraph_type)
    parser.add_argument('--dist_type', type=str, default=dist_type)#'norm', 'cos'
    parser.add_argument('--network_type', type=int, default=network_type)#if directed -> 0, if undirected -> 1
    parser.add_argument('--feature_type', type=str, default=feature_type)#'node2vec', 'onehot'
    parser.add_argument('--subgraph_feature_type', type=str, default=subgraph_feature_type)#'CNN', 'NDP'
    parser.add_argument('--n2v_dim', type=int, default=n2v_dim)
    parser.add_argument('--negative_injection', type=_str2bool, default=negative_injection)
    parser.add_argument('--log_steps', type=int, default=log_steps)
    #parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--GNN_type', type=str, default=GNN_type)#'gcn', 'sage', 'autoencoder'
    parser.add_argument('--GNN_num_layers', type=int, default=GNN_num_layers)
    parser.add_argument('--GNN_hidden_channels', type=int, default=GNN_hidden_channels)
    parser.add_argument('--GNN_out_channels', type=int, default=GNN_out_channels)
    parser.add_argument('--linear_num_layers', type=int, default=linear_num_layers)
    parser.add_argument('--linear_hidden_channels', type=int, default=linear_hidden_channels)
    parser.add_argument('--subg2vec_hidden_channels', type=int, default=subg2vec_hidden_channels)
    parser.add_argument('--subg2vec_out_channels', type=int, default=subg2vec_out_channels)
    parser.add_argument('--max_hop', type=int, default=max_hop)
    parser.add_argument('--dropout', type=float, default=dropout)
    parser.add_argument('--batch_size', type=int, default=batch_size)
    parser.add_argument('--lr', type=float, default=lr)
    parser.add_argument('--epochs', type=int, default=epochs)
    parser.add_argument('--eval_steps', type=int, default=eval_steps)
    parser.add_argument('--test_ratio', type=float, default=test_ratio)
    parser.add_argument('--runs', type=int, default=runs)
    parser.add_argument('--coefficient', type=int, default=coefficient)
    parser.add_argument('--no_start_run', type=int, default=no_start_run)
    parser.add_argument('--graphlet_size',type=int,default=4, help='Maximal graphlet size.')
    parser.add_argument('--label', type=str, default=label)

    # Parse an empty argv on purpose: inside a notebook sys.argv belongs to the kernel.
    args = parser.parse_args(args=[])

    return(args)
    

In [36]:
def main():
    """Run the training / evaluation pipeline configured by the module-level ``args``.

    For ``args.dataset == 'other'`` the function:

    1. loads the pre-computed graphs, edge splits and subgraph matrices via ``datasets()``;
    2. builds the GNN encoder selected by ``args.GNN_type``, the link predictor and, when
       ``args.subgraph_feature_type == 'CNN'``, the subgraph model;
    3. for every run in ``args.no_start_run .. args.runs`` trains for ``args.epochs`` epochs and
       saves a checkpoint under ``models/`` every ``args.coefficient`` epochs;
    4. every ``args.eval_steps`` epochs reloads each checkpoint saved so far, evaluates it with
       ``test()``, prints the metrics and writes them (plus raw predictions) under ``results/``;
    5. finally averages the per-run metric CSVs over ``args.runs`` runs, prints the averages and
       writes one ``*_final.csv`` per checkpointed epoch.

    The function takes no parameters and returns ``None``; all outputs are printed or written
    to disk.

    NOTE(review): the ``'ogb'`` branch only loads the dataset. ``data_train`` and the subgraph
    matrices are assigned in the ``'other'`` branch only, so with ``args.dataset == 'ogb'``
    execution fails at ``data_train.to(device)``.
    """

    print(args)

    def artifact_path(template, epoch, run):
        """Fill a checkpoint / result filename template with the experiment tags."""
        return template % (args.dataset_name, args.feature_type, args.GNN_type, args.subgraph_type,
                           args.subgraph_feature_type, args.dist_type, epoch, run)

    # Neither torch.save() nor open(..., mode='w') creates missing parent folders.
    os.makedirs('models', exist_ok=True)
    os.makedirs('results', exist_ok=True)

    # Metrics that are averaged over runs, in the column order of the final CSV.
    summary_fields = ['loss', 'accu_train', 'accu_test', 'recall_train',
                      'recall_test', 'precision_train', 'precision_test',
                      'f1_train', 'f1_test', 'avg_precision_train',
                      'avg_precision_test', 'pr_auc_train', 'pr_auc_test',
                      'roc_auc_train', 'roc_auc_test']

#******* device*******
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    # NOTE(review): the next line overrides the CUDA selection, so everything runs on CPU.
    device = 'cpu'
    device = torch.device(device)

    print(device)
    print(datetime.now())
#*******dataset*******
    if args.dataset == 'ogb':
        dataset = PygLinkPropPredDataset(name=args.dataset_name)
        data = dataset[0]
        split_edge = dataset.get_edge_split()
        pos_train_edge = split_edge['train']['edge']
        # Draw as many negative training edges as there are positive ones.
        split_edge['train']['edge_neg'] = negative_sampling(data.edge_index, num_nodes=data.num_nodes,
            num_neg_samples=pos_train_edge.size(0)).T

        edge_weight = torch.ones(data.edge_index.size(1), dtype=int)
        A = ssp.csr_matrix((edge_weight, (data.edge_index[0], data.edge_index[1])),
                           shape=(data.num_nodes, data.num_nodes))
        n_v = data.num_nodes
        n_e = data.num_edges
        num_subgraph_nodes = round((2*n_e/n_v)*(1+((2*n_e)/((n_v)*(n_v-1)))))#PLACN

    elif args.dataset == 'other':
        data_train, data_test, A, A_train, A_test, edge_pos_train,\
        edge_neg_train, edge_pos_test, edge_neg_test, num_subgraph_nodes,\
        sub_feature_train_matrix, sub_adj_train_matrix, sub_feature_test_matrix,\
        sub_adj_test_matrix, num_nodes = datasets()

        split_edge = {'train': {'edge': edge_pos_train,
                                'edge_neg': edge_neg_train},
                      'test': {'edge': edge_pos_test,
                               'edge_neg': edge_neg_test}}

    data_train = data_train.to(device)
    data_test = data_test.to(device)

#*******create models*******

    if args.GNN_type == 'gcn':
        # The GCN input width is taken from the subgraph feature rows, not from data_train.x.
        model = GCN(len(sub_feature_train_matrix[0][0]), args.GNN_hidden_channels, args.GNN_out_channels,
                    args.GNN_num_layers, args.dropout).to(device)
    elif args.GNN_type == 'sage':
        model = SAGE(data_train.x.shape[1], args.GNN_hidden_channels, args.GNN_out_channels,
                     args.GNN_num_layers, args.dropout).to(device)
    elif args.GNN_type == 'autoencoder':
        model = GAE(GAutoEncoder(data_train.x.shape[1], args.GNN_hidden_channels, args.GNN_out_channels,
                                 args.GNN_num_layers, args.dropout).to(device))

    model_predictor = predictor_model(args.GNN_out_channels, args.linear_hidden_channels,
                                      args.linear_num_layers, args.dropout).to(device)

    use_subgraph_cnn = args.subgraph_feature_type == 'CNN'
    if use_subgraph_cnn:
        model_subgraph = subg2vec_model(num_subgraph_nodes, args.GNN_out_channels, args.subg2vec_hidden_channels,
                                        args.subg2vec_out_channels, args.dropout).to(device)

    # Checkpoint interval in epochs.
    coeff = args.coefficient

#*******train and test model*******
    if args.dataset == 'ogb':
        print('OGB')

#**************
    elif args.dataset == 'other':

        for run in range(args.no_start_run, 1 + args.runs):

            model.reset_parameters()
            model_predictor.reset_parameters()
            if use_subgraph_cnn:
                model_subgraph.reset_parameters()

            optimizer = torch.optim.Adam(list(model.parameters()), lr=args.lr)
            optimizer_predictor = torch.optim.Adam(list(model_predictor.parameters()), lr=args.lr)
            if use_subgraph_cnn:
                # Fixed: this optimizer used to be built from model_predictor's parameters,
                # which left the subgraph model's weights without any optimizer.
                optimizer_subgraph = torch.optim.Adam(list(model_subgraph.parameters()), lr=args.lr)

            # Training loss recorded at every checkpointed epoch; entry k belongs to epoch (k+1)*coeff.
            losses = []

            for epoch in range(1, 1 + args.epochs):
                print(f'Epoch: {epoch:02d}, ')
#*******train*******
                print(datetime.now())

                if args.subgraph_feature_type == 'CNN':
                    loss = train(model, model_predictor, data_train, A_train, split_edge, sub_adj_train_matrix,
                                 sub_feature_train_matrix, optimizer, optimizer_predictor, args.batch_size,
                                 num_subgraph_nodes, model_subgraph, optimizer_subgraph)
                elif args.subgraph_feature_type == 'NDP':
                    loss = train(model, model_predictor, data_train, A_train, split_edge, sub_adj_train_matrix,
                                 sub_feature_train_matrix, optimizer, optimizer_predictor, args.batch_size,
                                 num_subgraph_nodes)

                print(datetime.now())

                if epoch % coeff == 0:

                    torch.save(model.state_dict(), artifact_path(
                        'models/%s_%s_%s_model_%s_%s_%s_epochs%d_run%d.pth', epoch, run))

                    torch.save(model_predictor.state_dict(), artifact_path(
                        'models/%s_%s_%s_model_predictor_%s_%s_%s_epochs%d_run%d.pth', epoch, run))

                    if use_subgraph_cnn:
                        torch.save(model_subgraph.state_dict(), artifact_path(
                            'models/%s_%s_%s_model_subgraph_%s_%s_%s_epochs%d_run%d.pth', epoch, run))

                    losses.append(loss)

                if epoch % args.eval_steps == 0:

                    # Fixed: only checkpoints up to the current epoch exist (and have an entry in
                    # `losses`); the loop used to run up to args.epochs and failed whenever
                    # eval_steps < epochs. After this loop the models hold the weights of the
                    # last checkpoint that was loaded.
                    for e in range(1, epoch + 1):
                        if e % coeff != 0:
                            continue
#*******test*******
                        model.load_state_dict(torch.load(artifact_path(
                            'models/%s_%s_%s_model_%s_%s_%s_epochs%d_run%d.pth', e, run)))

                        model_predictor.load_state_dict(torch.load(artifact_path(
                            'models/%s_%s_%s_model_predictor_%s_%s_%s_epochs%d_run%d.pth', e, run)))

                        if args.subgraph_feature_type == 'CNN':
                            model_subgraph.load_state_dict(torch.load(artifact_path(
                                'models/%s_%s_%s_model_subgraph_%s_%s_%s_epochs%d_run%d.pth', e, run)))

                            result = test(model, model_predictor, data_train, A_train, data_test, A_test,
                                          split_edge, sub_adj_train_matrix,
                                          sub_feature_train_matrix, sub_adj_test_matrix, sub_feature_test_matrix,
                                          args.batch_size, num_subgraph_nodes, model_subgraph)
                        elif args.subgraph_feature_type == 'NDP':
                            result = test(model, model_predictor, data_train, A_train, data_test, A_test,
                                          split_edge, sub_adj_train_matrix,
                                          sub_feature_train_matrix, sub_adj_test_matrix, sub_feature_test_matrix,
                                          args.batch_size, num_subgraph_nodes)

                        train_result = result['train']
                        test_result = result['test']

                        pred_train = train_result['pred']
                        pred_test = test_result['pred']

                        y_train = train_result['y']
                        y_test = test_result['y']

                        accu_train = train_result['accu_train']
                        accu_test = test_result['accu_test']

                        recall_train = train_result['recall_train']
                        recall_test = test_result['recall_test']

                        precision_train = train_result['precision_train']
                        precision_test = test_result['precision_test']

                        f1_train = train_result['f1_train']
                        f1_test = test_result['f1_test']

                        avg_precision_train = train_result['avg_precision_train']
                        avg_precision_test = test_result['avg_precision_test']

                        pr_auc_train = train_result['pr_auc_train']
                        pr_auc_test = test_result['pr_auc_test']

                        roc_auc_train = train_result['roc_auc_train']
                        roc_auc_test = test_result['roc_auc_test']

                        precision_list_train = train_result['precision_list_train']
                        recall_list_train = train_result['recall_list_train']

                        precision_list_test = test_result['precision_list_test']
                        recall_list_test = test_result['recall_list_test']

    #*******print results*******

                        # Position of this checkpoint's training loss inside `losses`.
                        i = int(e / coeff) - 1

                        with open(artifact_path('results/%s_%s_%s_model_%s_%s_%s_epochs%d_run%d.csv', e, run),
                                  mode='w') as csv_file:
                            fieldnames = ['loss', 'accu_train', 'accu_test', 'recall_train',
                                          'recall_test', 'precision_train', 'precision_test',
                                          'f1_train', 'f1_test', 'avg_precision_train',
                                          'avg_precision_test', 'pr_auc_train', 'pr_auc_test',
                                          'precision_list_train', 'recall_list_train',
                                          'precision_list_test', 'recall_list_test',
                                          'roc_auc_train', 'roc_auc_test']
                            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

                            writer.writeheader()
                            # Fixed: 'pr_auc_train' was missing from this row, so the column was
                            # written empty and the run average computed below became NaN.
                            writer.writerow({'loss': losses[i], 'accu_train': accu_train,
                                             'accu_test': accu_test, 'recall_train': recall_train,
                                             'recall_test': recall_test, 'precision_train': precision_train,
                                             'precision_test': precision_test,
                                             'f1_train': f1_train, 'f1_test': f1_test,
                                             'avg_precision_train': avg_precision_train,
                                             'avg_precision_test': avg_precision_test,
                                             'pr_auc_train': pr_auc_train,
                                             'pr_auc_test': pr_auc_test,
                                             'precision_list_train': precision_list_train,
                                             'recall_list_train': recall_list_train,
                                             'precision_list_test': precision_list_test,
                                             'recall_list_test': recall_list_test,
                                             'roc_auc_train': roc_auc_train, 'roc_auc_test': roc_auc_test})

                        # Raw predictions and labels go to a separate file (one row, list-valued cells).
                        with open(artifact_path('results/pred_%s_%s_%s_model_%s_%s_%s_epochs%d_run%d.csv', e, run),
                                  mode='w') as pred_file:
                            pred_writer = csv.DictWriter(
                                pred_file, fieldnames=['pred_train', 'pred_test', 'y_train', 'y_test'])

                            pred_writer.writeheader()
                            pred_writer.writerow({'pred_train': pred_train.tolist(), 'pred_test': pred_test.tolist(),
                                                  'y_train': y_train.tolist(), 'y_test': y_test.tolist()})

                        print(f'Best Result in epochs {e:02d} and Run {run:02d} ')
                        print('---')

                        print(f'Loss_model: {losses[i]:.4f}')

                        print(f'Accu_Train: {accu_train:.4f},     '
                              f'Accu_Test: {accu_test:.4f}')

                        print(f'Recall_Train: {recall_train:.4f},     '
                              f'Recall_Test: {recall_test:.4f}')

                        print(f'Precision_Train: {precision_train:.4f},     '
                              f'Precision_Test: {precision_test:.4f}')

                        print(f'F1_Score_Train: {f1_train:.4f},     '
                              f'F1_Score_Test: {f1_test:.4f}')

                        print(f'Avrage_precision_Train: {avg_precision_train:.4f},     '
                              f'Avrage_precision_Test: {avg_precision_test:.4f}')

                        print(f'Precision-Recall_AUC_Train: {pr_auc_train:.4f},     '
                              f'Precision-Recall_AUC_Test: {pr_auc_test:.4f}')

                        print(f'ROC_AUC_Train: {roc_auc_train:.4f},     '
                              f'ROC_AUC_Test: {roc_auc_test:.4f}')

                        print('---')

        # Collect the per-run metric rows from disk: results[run][checkpoint] -> list of metrics.
        # Runs start at 1 here, so CSVs of runs below args.no_start_run must already exist.
        results = []

        for r in range(1, args.runs + 1):
            result = []
            for e in range(1, args.epochs + 1):

                if e % args.coefficient == 0:
                    df = pd.read_csv(artifact_path('results/%s_%s_%s_model_%s_%s_%s_epochs%d_run%d.csv', e, r))
                    result.append([df[field].item() for field in summary_fields])
            results.append(result)

        # The zero vector broadcasts against each run's (checkpoints x metrics) table.
        sum_results = torch.zeros([len(summary_fields)])

        for r in range(args.runs):
            sum_results = sum_results + torch.tensor(results[r])

        final_result_model = sum_results / args.runs

        print('--------------------------------------------------------------------')
        print('--------------------------------------------------------------------')
        print(f'Final Results')
        print('---')

        for e in range(1, args.epochs + 1):
            if e % coeff != 0:
                continue

            i = int(e / coeff) - 1
            averaged = final_result_model[i].tolist()

            print(f'Final Results Epoch: {e:02d}')
            print('---')
            print(f'Loss_model: {averaged[0]:.4f} ')

            print(f'Accu_Train: {averaged[1]:.4f},     '
                  f'Accu_Test: {averaged[2]:.4f}')

            print(f'Recall_Train: {averaged[3]:.4f},     '
                  f'Recall_Test: {averaged[4]:.4f}')

            print(f'Precision_Train: {averaged[5]:.4f},     '
                  f'Precision_Test: {averaged[6]:.4f}')

            print(f'F1_Score_Train: {averaged[7]:.4f},     '
                  f'F1_Score_Test: {averaged[8]:.4f}')

            print(f'Avrage_precision_Train: {averaged[9]:.4f},     '
                  f'Avrage_precision_Test: {averaged[10]:.4f}')

            print(f'Precision-Recall_AUC_Train: {averaged[11]:.4f},     '
                  f'Precision-Recall_AUC_Test: {averaged[12]:.4f}')

            print(f'ROC_AUC_Train: {averaged[13]:.4f},     '
                  f'ROC_AUC_Test: {averaged[14]:.4f}')
            print('---')

            with open(artifact_path('results/%s_%s_%s_model_%s_%s_%s_epochs%d_run%d_final.csv', e, args.runs),
                      mode='w') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=summary_fields)

                writer.writeheader()
                writer.writerow(dict(zip(summary_fields, averaged)))
        print(datetime.now())


In [None]:
# Experiment configuration for this run. `args` is assigned at module level
# (outside the __main__ guard) because main() reads it as a global.
args = parse_arguments(
    dataset='other',
    dataset_name='Karate',
    subgraph_type='DIS',
    dist_type='cos',
    feature_type='node2vec',
    subgraph_feature_type='NDP',
    GNN_type='gcn',
    epochs=200,
    eval_steps=200,
    no_start_run=1,
    runs=1,
    coefficient=50,
)

# Launch the pipeline only when this file is executed directly.
if __name__ == "__main__":
    main()