# struc2vec.py

In [None]:
from struc2vec.src.PreProcess import *
from struc2vec.src.GraphAlgorithms import GraphAlgorithms as ga
from struc2vec.src.RandomWalker import *
import networkx as nx

class struc2vec():
    """
    Holds a graph together with everything the struc2vec pipeline derives from it:
    node list, node pairs, diameter, degree sequences, the multi-level context graph
    and the per-layer transition data used by the random walks.
    """

    def __init__(self, G, preprocess=True):
        '''
        First a graph object is created, this stores the relevant information of the graph and allows the application
        of the struc2vec algorithm and storing the results.

        An undirected copy of the graph is always built; it is used for the diameter and for the
        neighbourhoods of the context graph. The graph used for in-/out-degrees (``G_D``) is the
        original graph when it is directed and the undirected copy otherwise.

        Parameters:
            G: a networkx graph (directed or undirected).
            preprocess: when True (default) the degree sequences are computed immediately. Set it to
                False when the context graph has already been generated and only needs to be loaded.
        '''
        self.is_directed = nx.is_directed(G)
        self.G_UD = nx.Graph(G)
        # G_D must exist for every input: it is read below regardless of directedness.
        # Previously it was only assigned for directed graphs, which made undirected
        # inputs fail with AttributeError during preprocessing.
        self.G_D = G if self.is_directed else self.G_UD
        self.nodes = list(self.G_UD.nodes)  # List of all nodes
        self.nodePairs = self.getNodePairs()
        self.diameter = nx.diameter(self.G_UD)
        if preprocess:
            self.DegreeSequences = getDegreeSequences(self.G_UD, self.G_D)

    def getNodePairs(self):
        """
        Return all unique unordered node pairs as a list of tuples.

        Pairs follow the order of ``self.nodes``: each node is paired with every node that
        comes after it, so a pair (a, b) never reappears as (b, a).
        """
        nodes = self.nodes
        return [(v, d) for i, v in enumerate(nodes) for d in nodes[i + 1:]]

    def getMultiLevelGraph(self, path=None):
        """
        Build (or load) the context graph for the random walks and store it on the object.

        Nothing is returned. The following attributes are set:
            G_ML: dictionary of layer -> graph.
            adj_dicts: dictionary of layer -> adjacency dictionary with edge weights.
            n_layers: index of the top layer (equal to the diameter).
            upweightdict: dictionary of layer -> per-node value used when deciding to move up a layer.

        Parameters:
            path: optional directory holding a previously generated context graph. When given,
                the layers are loaded from it instead of being computed. A trailing "/" is
                appended if missing.
        """
        if path and not path.endswith("/"):
            path += "/"
        self.G_ML, self.adj_dicts = ga().MultiLevelGraph(self, self.diameter, path=path)
        self.n_layers = self.diameter
        self.upweightdict = ga().getUpWeightDict(self.G_ML)

    def getRandomWalks(self, start_node=None, number_of_walks=100, walk_length=10, q=0.2):
        """
        Generate random walks on the context graph; requires getMultiLevelGraph to have been called.

        Parameters:
            start_node: node every walk starts from; when None a random node is drawn per walk.
            number_of_walks: number of walks to generate.
            walk_length: number of steps per walk.
            q: threshold passed on to the layer-switching decision of the walker.

        Returns:
            A list of walks, each walk being a list of nodes.
        """
        return random_walk(self, start_node, number_of_walks, walk_length, q)
        


# GraphAlgorithms.py

In [None]:
import numpy as np
import fastdtw
import networkx as nx
import os

class GraphAlgorithms():
    def __init__(self):
        '''
        This object contains the different functions to be applied on the graph object for the struc2vec algorithm.

        The only state it keeps is ``G_ML`` (layer -> graph), which is created by MultiLevelGraph
        and read by calculateDistances.
        '''
        pass

    def d_func(self,a,b):
        '''
        Distance between two degree values: max(a, b) / min(a, b) - 1.

        Parameters:
            a, b: numbers or size-1 array-likes (this is used as the `dist` callback of
                fastdtw on (n, 1) arrays in calculateDistances). Values are expected to be
                positive.

        Returns:
            A Python float. 0.0 when both values are equal. If the smaller value is 0 the
            ratio is undefined; inf is returned (0.0 when both are 0) instead of dividing by zero.

        Raises:
            ValueError: if an argument holds more than one element.
        '''
        # Unwrap size-1 arrays explicitly; float() on such arrays is deprecated in NumPy.
        a = np.asarray(a, dtype=float).item()
        b = np.asarray(b, dtype=float).item()
        hi, lo = max(a, b), min(a, b)
        if lo == 0:
            return 0.0 if hi == 0 else float("inf")
        return float(hi / lo - 1)

    @staticmethod
    def _shifted_column(values):
        '''
        Sort `values` in descending order and return them as an (n, 1) float array shifted by +1.
        The shift keeps every entry non-zero for non-negative degrees, so d_func never divides by zero.
        '''
        return np.array(sorted(values, reverse=True), dtype=float).reshape(-1, 1) + 1.0

    def calculateDistances(self, s2vG, n_steps):
        """
        The function utilizes the preprocessed degreesequences and calculates a distance
        measure for the in- and out-degree sequences between all node pairs.

        For every pair, the DTW distance (fastdtw with d_func) is computed separately for the
        in- and out-degree sequences of layer `n_steps`. The two distances are averaged and
        turned into a weight exp(-mean distance). For n_steps > 0 the weight of the same pair
        in layer n_steps-1 (read from self.G_ML) is added.
        NOTE(review): this accumulates exp(-distance) terms across layers rather than
        accumulating distances before exponentiating - confirm this is intended.

        Parameters:
            s2vG: object providing `nodePairs` and `DegreeSequences[node][layer]["in"/"out"]`.
            n_steps: layer index. For n_steps > 0, self.G_ML[n_steps-1] must already exist
                (MultiLevelGraph builds the layers in ascending order).

        Returns:
            A list of edges (v0, v1, {"weight": w}), one per node pair, suitable for nx.Graph(...).
        """
        # The prepared sequences only depend on the node, so build them once per node
        # instead of once per pair.
        prepared = {}

        def columns_for(node):
            if node not in prepared:
                sequences = s2vG.DegreeSequences[node][n_steps]
                prepared[node] = (
                    self._shifted_column(sequences["in"]),
                    self._shifted_column(sequences["out"]),
                )
            return prepared[node]

        edgelist_dist = []
        for v0, v1 in s2vG.nodePairs:
            in0, out0 = columns_for(v0)
            in1, out1 = columns_for(v1)

            # fastdtw returns (distance, warping path); only the distance is needed
            dist_in, _ = fastdtw.fastdtw(in0, in1, dist=self.d_func)
            dist_out, _ = fastdtw.fastdtw(out0, out1, dist=self.d_func)

            prev_weight = 0
            if n_steps != 0:
                prev_weight = self.G_ML[n_steps-1].edges[v0,v1]["weight"]
            weight = prev_weight + np.exp(-np.mean([dist_in, dist_out]))
            edgelist_dist.append((v0, v1, {"weight": weight}))
        return edgelist_dist

    def MultiLevelGraph(self, s2vG, n_level, path=None):
        """
        This function takes a s2vG-object as input and generates the corresponding context graph for it.
        If a path is defined, the graph is loaded from it, otherwise the graph is created.

        Parameters:
            s2vG: the struc2vec object (only used when the graph is created).
            n_level: index of the highest layer; layers 0..n_level are created.
            path: optional directory with one GEXF file per layer, named "<layer>.<ext>".
                Entries whose name does not start with an integer are skipped.
                NOTE(review): nx.read_gexf reads node ids as strings by default - confirm that
                this matches the node ids of s2vG when loading.

        Returns:
            (G_ML, adj_dicts): the context graph as a dictionary with layers as keys and graphs
            as values, and the matching adjacency dictionaries (see getAdjacencyDicts).
        """
        self.G_ML = {}
        if path:
            for file in os.listdir(path):
                try:
                    layer_n = int(file.split(".")[0])
                except ValueError:
                    # Not a layer file (e.g. a README or hidden file): ignore it instead of crashing
                    continue
                self.G_ML[layer_n] = nx.read_gexf(os.path.join(path, file))
        else:
            for i in range(n_level+1):
                edgelist_i = self.calculateDistances(s2vG, i)
                self.G_ML[i] = nx.Graph(edgelist_i)

        adj_dicts = self.getAdjacencyDicts(self.G_ML)
        return self.G_ML, adj_dicts

    def getAdjacencyDicts(self, G_ML):
        '''
        Convert every layer graph into a plain dictionary {node: {neighbor: weight}}.
        Returns a dictionary with layers as keys and these adjacency dictionaries as values.
        '''
        return {
            layer: {
                node: {neighbor: attr["weight"] for neighbor, attr in neighbors.items()}
                for node, neighbors in graph.adjacency()
            }
            for layer, graph in G_ML.items()
        }

    def getUpWeightsLayer(self, G):
        '''
        The function takes a graph as input, and calculates for every node the value used for
        moving from layer n to layer n+1.

        For each node, the share of its edges whose weight is at least the average edge weight
        of the graph is computed; the up-weight is log(share + e) and the returned value is
        upWeight / (upWeight + 1).
        A node without neighbours gets share 0 (value 0.5) instead of NaN.

        It returns a dictionary with nodes as keys and this value as value.
        '''
        # Calculate mean weight of all edges in graph
        edge_weights = [data["weight"] for _, _, data in G.edges(data=True)]
        avg_edge_weight = np.mean(edge_weights) if edge_weights else 0.0

        upEdge = {}
        for node in G.nodes:
            above_avg = [int(att["weight"] >= avg_edge_weight) for att in G[node].values()]
            share = np.mean(above_avg) if above_avg else 0.0
            upWeight = np.log(share + np.exp(1))
            upEdge[node] = upWeight / (upWeight+1)

        return upEdge

    def getUpWeightDict(self, G_multilayer):
        '''
        The function takes a dictionary for a multilayer graph as input, and returns a
        dictionary with, for each layer, the per-node values computed by getUpWeightsLayer.
        '''
        return {layer: self.getUpWeightsLayer(G) for layer, G in G_multilayer.items()}
    

# Helpers.py

In [None]:
import numpy as np
import pandas as pd
import networkx as nx

def scale_weights(weights, min_scale=0.1, max_scale=10):
    """
    Linearly rescale a list of weights to the range min_scale-max_scale.

    The smallest weight maps to `min_scale` and the largest to `max_scale`.

    Parameters:
        weights: iterable of numbers.
        min_scale: lower bound of the target range.
        max_scale: upper bound of the target range.

    Returns:
        A list with one rescaled value per input weight. An empty input gives an empty list.
        If all weights are equal there is no spread to rescale, so every value is set to
        `min_scale` (instead of dividing by zero and producing NaN).
    """
    values = np.asarray(list(weights), dtype=float)
    if values.size == 0:
        return []

    min_weight = values.min()
    max_weight = values.max()
    spread = max_weight - min_weight
    if spread == 0:
        return list(np.full_like(values, min_scale))

    scaled = ((max_scale - min_scale) * (values - min_weight)) / spread + min_scale
    return list(scaled)

def get_meta_data(G, nodes, embeddings, cmap=None):
    """
    Collect per-node graph statistics next to each node's embedding and summarise them per embedding.

    Parameters:
        G: networkx graph; every edge must carry a "weight" attribute.
        nodes: nodes to describe (must be nodes of G).
        embeddings: one value per node, aligned with `nodes`. The values are used as
            group-by keys and as keys into `cmap`.
        cmap: optional mapping embedding -> color; adds a "Color" column to the summary.

    Returns:
        (df_meta, df_summary) where df_meta has one row per node with the columns Node,
        Embedding, Clustering, Betweenness_centrality, Closeness_centrality, InDegree,
        OutDegree and Average_weight, and df_summary is indexed by embedding and holds the
        number of nodes, the optional color and the per-embedding means rounded to 3 decimals.
    """
    df_meta = pd.DataFrame(list(zip(nodes, embeddings)), columns=["Node", "Embedding"])
    node_column = df_meta["Node"]

    # Clustering
    clustering = nx.clustering(G)
    df_meta["Clustering"] = [clustering[n] for n in node_column]

    # Betweenness centrality
    betweenness = nx.betweenness_centrality(G)
    df_meta["Betweenness_centrality"] = [betweenness[n] for n in node_column]

    # Closeness centrality
    closeness = nx.closeness_centrality(G)
    df_meta["Closeness_centrality"] = [closeness[n] for n in node_column]

    # In- and out-degree from the unweighted (0/1) adjacency matrix:
    # column sums count incoming edges, row sums count outgoing edges.
    A = nx.adjacency_matrix(G, weight=None)
    column_sums = np.asarray(A.sum(axis=0)).ravel()
    row_sums = np.asarray(A.sum(axis=1)).ravel()
    degreeDict = {
        node: {"inDegree": column_sums[i], "outDegree": row_sums[i]}
        for i, node in enumerate(G.nodes())
    }
    df_meta["InDegree"] = [degreeDict[n]["inDegree"] for n in node_column]
    df_meta["OutDegree"] = [degreeDict[n]["outDegree"] for n in node_column]

    # Average weight over the edges listed in G[node]
    avg_weight = {
        node: np.mean([edge_attrs["weight"] for edge_attrs in G[node].values()])
        for node in G.nodes()
    }
    df_meta["Average_weight"] = [avg_weight[n] for n in node_column]

    # Number of nodes per embedding
    node_counts = df_meta.groupby("Embedding")["Node"].count()
    df_grouped = pd.DataFrame(node_counts)
    df_grouped.columns = ["Number of nodes"]

    if cmap:
        df_grouped["Color"] = [cmap[emb] for emb in df_grouped.index]

    # Mean of every statistic per embedding, rounded to 3 decimals
    avg_data = df_meta.drop(columns="Node").groupby("Embedding").mean().round(3)

    return df_meta, df_grouped.merge(avg_data, left_index=True, right_index=True)


# PreProcess.py

In [None]:
import networkx as nx

def getDegreeSequences(G_UD, G_D):
    """
    Compute per-node, per-layer in- and out-degree sequences.

    The function takes an undirected network (G_UD) and the network used for counting
    in-/out-degrees (G_D), both built from the same graph.

    For every origin node and every layer 0..diameter(G_UD), the nodes whose shortest-path
    distance (in G_UD) to the origin equals the layer are collected, and their in- and
    out-degrees (in G_D) are listed. Layer 0 therefore holds the origin node itself.
    Each layer only holds the nodes at exactly that distance; earlier layers are not included.
    If no node lies at a given distance from the origin, the placeholder [1] is stored for
    both sequences.

    The final output is a dictionary in the style
    {origin_node: {layer: {"in": [in-degrees], "out": [out-degrees]}}}
    """
    # 0
    # Unweighted adjacency matrix: all cells are either 0 or 1 to support counting for degree
    A = nx.adjacency_matrix(G_D, weight=None)
    index_to_node = dict(enumerate(G_D.nodes()))
    diameter = nx.diameter(G_UD)

    # 1
    # In-degree is the sum of a node's column, out-degree the sum of its row
    degreeDict = {}
    for idx in range(A.shape[0]):
        degreeDict[index_to_node[idx]] = {
            "inDegree": A[:, [idx]].sum(),
            "outDegree": A[[idx], :].sum(),
        }

    # 2
    # Shortest path lengths between all node pairs, 'reversed' into
    # {origin: {distance: [nodes at that distance]}}
    all_shortest_paths = dict(nx.all_pairs_shortest_path_length(G_UD))

    neighborhood_by_layer = {}
    for origin_node, distances in all_shortest_paths.items():
        rings = {}
        for node, distance in distances.items():
            rings.setdefault(distance, []).append(node)
        neighborhood_by_layer[origin_node] = rings

    # 3
    # Translate every ring of nodes from #2 into the in- and out-degrees from #1
    degree_vectors = {}
    for origin_node, rings in neighborhood_by_layer.items():
        per_layer = {}
        for layer in range(diameter + 1):
            if layer in rings:
                members = rings[layer]
                per_layer[layer] = {
                    "in": [degreeDict[member]["inDegree"] for member in members],
                    "out": [degreeDict[member]["outDegree"] for member in members],
                }
            else:
                # No node at this distance from the origin
                per_layer[layer] = {"in": [1], "out": [1]}
        degree_vectors[origin_node] = per_layer

    # 4
    # Finally return the dictionary
    return degree_vectors



# RandomWalker.py

In [None]:
import numpy as np


def random_walk_step(current_node, s2vG, n_layer):
    '''
    Pick the next node of a walk among the neighbors of the current node in the given layer.

    Each neighbor is chosen with probability proportional to its edge weight in
    s2vG.adj_dicts[n_layer][current_node].

    Parameters:
        current_node: node the walk is currently at.
        s2vG: object providing `adj_dicts` (layer -> {node: {neighbor: weight}}).
        n_layer: layer to take the step in.

    Returns:
        The chosen neighbor, exactly as stored in the adjacency dictionary.

    Raises:
        ValueError: if the current node has no neighbors in this layer.
    '''
    neighbor_weights = s2vG.adj_dicts[n_layer][current_node]
    if not neighbor_weights:
        raise ValueError("current node has no neighbors in layer %s" % n_layer)

    neigh = list(neighbor_weights.keys())
    weight_total = sum(neighbor_weights.values())
    prob = [w / weight_total for w in neighbor_weights.values()]

    # Draw a position rather than the node itself: passing the node list to
    # np.random.choice converts the nodes to a numpy array, which changes their type
    # (e.g. str -> np.str_) and fails for tuple node ids.
    position = np.random.choice(len(neigh), p=prob)
    return neigh[position]

def random_walk(s2vG, start_node, number_of_walks, walk_length, q):
    '''
    Generate random walks on the multi-level context graph.

    Parameters:
        s2vG: object providing `nodes` plus the data read by getLayer and random_walk_step.
        start_node: node every walk starts from. When None, a node is drawn uniformly at
            random from s2vG.nodes for each walk. Falsy node ids such as 0 are valid start nodes.
        number_of_walks: number of walks to generate.
        walk_length: number of steps per walk; each walk holds walk_length + 1 nodes
            (the start node included).
        q: threshold passed to getLayer for deciding whether to change layer before a step.

    Returns:
        A list of walks, each walk being a list of nodes.
    '''
    walks = []
    for _ in range(number_of_walks):
        # Every walk starts at the bottom layer; the layer must not carry over
        # from where the previous walk ended.
        n_layer = 0

        if start_node is not None:
            current_node = start_node
        else:
            # Index into the list so the node keeps its original Python type
            current_node = s2vG.nodes[np.random.randint(len(s2vG.nodes))]

        walk = [current_node]
        for _ in range(walk_length):
            n_layer = getLayer(s2vG, n_layer, current_node, q)
            current_node = random_walk_step(current_node, s2vG, n_layer)
            walk.append(current_node)
        walks.append(walk)
    return walks

def getLayer(s2vG, n_layer, current_node, q):
    '''
    Decide which layer the next step of a walk is taken in.

    A uniform random number is drawn; if it is larger than q the walk stays in its layer.
    Otherwise the layer changes: from layer 0 the walk always moves up, from the top layer
    (s2vG.n_layers) it always moves down, and in between it moves up unless a second random
    number is at least s2vG.upweightdict[n_layer][current_node], in which case it moves down.

    Returns:
        The layer index to use for the next step.
    '''
    if np.random.random() > q:
        return n_layer

    # The layer changes; look up the node's value for moving up
    up_p = s2vG.upweightdict[n_layer][current_node]

    if n_layer == 0:
        return 1
    if n_layer == s2vG.n_layers:
        return n_layer - 1

    moves_down = np.random.random() >= up_p
    return n_layer - 1 if moves_down else n_layer + 1
