## Clustering on the event hypergraph
### Node similarity measure:
- topological overlap + event/sentence embeddings
### Group similarity measure:
- average similarity of node pairs between cluster pairs
### Procedure:
- convert hypergraph to weighted normal graph (or don't?)
- Assign each node to its own cluster and evaluate similarity measure for all node pairs
- merge node pairs with highest similarity measure into the same community 
  - Open question: how many pairs should be merged per step?
- repeat by merging clusters in the same way until no merge is available
### dual of hypergraph
- link clustering can be achieved by doing clustering on the dual of a hypergraph

In [None]:
import networkx as nx
import hypernetx as hnx
import numpy as np
import json
import hypernetx.algorithms.hypergraph_modularity as hmod
import igraph as ig
from collections import defaultdict
import itertools
import copy
import time 
import sys
import operator
import math

In [None]:
# Read the bipartite node-link graph from disk and turn it into a hypergraph.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
with open('data/result/RAMS/gpt_biHgraph_dev/hgraph.json') as f:
    B = nx.node_link_graph(json.load(f))
H = hnx.Hypergraph.from_bipartite(B)
# Last expression of the cell: displays the hypergraph shape in the notebook.
list(H.shape)

## Reduce the hypergraph to its two-section graph using the edge reweighting proposed in [1]


[1] Kumar T., Vaidyanathan S., Ananthapadmanabhan H., Parthasarathy S. and Ravindran B. “A New Measure of Modularity in Hypergraphs: Theoretical Insights and Implications for Effective Clustering”. In: Cherifi H., Gaito S., Mendes J., Moro E., Rocha L. (eds) Complex Networks and Their Applications VIII. COMPLEX NETWORKS 2019. Studies in Computational Intelligence, vol 881. Springer, Cham

In [None]:
def embedding_vec2dict(embedding):
    """Return a dict mapping each position of `embedding` to its value.

    `embedding` must support `len()` and integer indexing; positions run
    from 0 to `len(embedding) - 1`.
    """
    by_position = {}
    for position in range(len(embedding)):
        by_position[position] = embedding[position]
    return by_position

In [None]:
# Load the hyperedge records (including their embeddings) from disk; the
# `with` block closes the file handle as soon as parsing is done.
with open('data/result/RAMS/gpt_biHgraph_dev/hyperedges_w_embeddings.json') as f:
    hyperedge_dict = json.load(f)

In [None]:
# clustering on hyperedges
# Work on the dual of H (built with `H.dual()`), so that the clustering below
# operates on the dual's nodes; the shape is printed as a quick sanity check.
dual_H = H.dual()
print(dual_H.shape)

In [None]:
# Walk over the s-components of the dual hypergraph, reduce every
# non-singleton component to its two-section graph, and collect the resulting
# edge weights in `weights` (nested dict keyed by vertex name) so that a single
# graph can be rebuilt from it afterwards.
component_subgraphs = dual_H.s_component_subgraphs(edges=False, return_singletons=True)
G_ccs = ig.Graph()
weights = defaultdict(lambda: defaultdict(dict))
total = 0
# Event names (vertex name without its leading "<id>-" part) seen so far in
# non-singleton components; used to detect repeated events.
event_set = set()
total_edges = 0
for s_component in component_subgraphs:
    total += s_component.shape[0]
    if s_component.shape[0] == 1:
        event = list(s_component.nodes())[0]
        # Fix: derive the event name from this component's own node. The
        # previous code read `v['name']`, i.e. a vertex left over from an
        # earlier iteration (or an undefined name on the first iteration).
        event_name = "-".join(str(event).split('-')[1:])
        if event_name not in event_set:
            G_ccs.add_vertices(list(s_component.nodes()))
        continue
    cc = hmod.two_section(s_component)
    index2id_dict = {v.index: v['name'] for v in cc.vs}

    # Vertices whose event name was already seen are not removed from the
    # graph; only the edges touching them are dropped below.
    deleted_vertices = set()
    for v in cc.vs:
        event_name = "-".join(v['name'].split('-')[1:])
        if event_name in event_set:
            deleted_vertices.add(v['name'])
        event_set.add(event_name)

    # cc.delete_vertices(deleted_vertices)

    deleted_edges = [
        (e.source, e.target)
        for e in cc.es
        if index2id_dict[e.source] in deleted_vertices
        or index2id_dict[e.target] in deleted_vertices
    ]
    cc.delete_edges(deleted_edges)

    print(cc.vcount())
    total_edges += len(cc.es)

    # Every vertex gets a zero-weight self entry so that it is present in
    # `weights` even when it has no remaining edges.
    for v in cc.vs:
        weights[v['name']][v['name']]['weight'] = 0
    # Store each remaining edge in both directions.
    for e in cc.es:
        source_name = cc.vs[e.source]['name']
        target_name = cc.vs[e.target]['name']
        weights[source_name][target_name]['weight'] = e['weight']
        weights[target_name][source_name]['weight'] = e['weight']

print(total_edges, len(weights))

In [None]:
# Rebuild one graph from the nested `weights` dict collected above; this
# replaces the `G_ccs` object created before the component loop.
G_ccs = ig.Graph.DictDict(weights)
# Lookup from vertex name to the vertex index inside G_ccs.
id2index_dict = {}
for v in G_ccs.vs:
    id2index_dict[v['name']] = v.index

In [None]:
# Adjacency matrix of G_ccs whose entries are taken from the 'weight' edge attribute.
A = G_ccs.get_adjacency(attribute='weight')

In [None]:
# hyperedge_embeddings = {hyperedge['id']: embedding_vec2dict(hyperedge['embedding']) for hyperedge in hyperedge_dict.values()}
# Map each key of hyperedge_dict to the 'embedding' field of its record.
hyperedge_embeddings = {hyperedge_id: hyperedge_data['embedding'] for hyperedge_id, hyperedge_data in hyperedge_dict.items()}
# Re-key the embeddings by G_ccs vertex index; this requires every vertex
# name of G_ccs to be a key of hyperedge_embeddings (KeyError otherwise).
attr_dict = {v.index: hyperedge_embeddings[v['name']] for v in G_ccs.vs}

In [None]:
def ilouvain(G, attr_dict, D=None):
    """
    Modified version of the Louvain algorithm that takes embeddings into account.

    Every vertex starts in its own community. Vertices are then greedily
    reassigned to the community of a structural or semantic neighbour, using
    the sum of a modularity term and an inertia (embedding) term as the gain,
    and the resulting communities are fused into the vertices of a coarser
    graph.

    Args:
        G: graph exposing `vs`, `vcount()`, `neighbors()` and
            `get_adjacency(attribute='weight')`.
        attr_dict: mapping from vertex index to that vertex's embedding vector.
        D: optional nested mapping `D[i][j]` of squared Euclidean distances
            between embeddings; computed from `attr_dict` when omitted.

    Returns:
        `(G, attr_dict, D)` describing the coarsened graph after the first
        aggregation pass, or `(ori_P, levels)` when the stop condition is met
        during that pass.

    NOTE(review): the two return shapes differ and the early `return` inside
    the loop means only one aggregation pass is ever executed. This is kept
    as-is because the existing caller unpacks three values.
    """
    def generate_node_pair(arr):
        """Return every ordered pair of items of `arr`, self-pairs included."""
        return [(v1, v2) for v1 in arr for v2 in arr]

    def kro(c1, c2):
        """Kronecker delta: 1 when both community labels are equal, else 0."""
        return 1 if c1 == c2 else 0

    def weighted_degree(A):
        """Return {row index: sum of that row of A}."""
        return {v: sum(A[v]) for v in range(0, len(A))}

    def QQ(P, G, A, D, K, attr_dict, I_vs):
        """Global quality of partition P: modularity term plus inertia term.

        The per-pair terms are also dumped to 'QQ_list.txt' for inspection.
        The value is only printed by the caller; it does not drive any move.
        """
        m = twod_sum(A)/2
        N = G.vcount()
        list_node_pairs = generate_node_pair([v.index for v in G.vs])
        print("node_pairs: ", len(list_node_pairs))
        I_V = Invertia(G, attr_dict)
        I_denominator_1 = 2*N*I_V
        I_denominator_2 = I_denominator_1**2
        QQ_list = [
            (0, 0) if kro(P[v1], P[v2]) == 0 else
            (
                # Q_NG. Fix: the null-model term is K_i*K_j/(2m); previously
                # the whole difference was divided by 2m, which disagreed with
                # delta_modular.
                (A[v1][v2] - K[v1]*K[v2]/(2*m)) / (2*m),
                # Q_Invertia. Fix: the denominators were swapped relative to
                # delta_invertia (product over denom**2, distance over denom).
                I_vs[v1]*I_vs[v2]/I_denominator_2 - D[v1][v2]/I_denominator_1,
            )
            for v1, v2 in list_node_pairs
        ]
        # Write directly to the file instead of temporarily replacing
        # sys.stdout, which was left redirected if printing raised.
        with open('QQ_list.txt', 'w') as f:
            print(QQ_list, file=f)

        return np.sum([q_ng + q_inertia for q_ng, q_inertia in QQ_list])

    def Invertia(G, attr_dict, vp=None):
        """Inertia of the embeddings.

        Without `vp`: sum of squared distances of all embeddings to their
        centroid. With `vp`: sum of squared distances from vertex `vp`'s
        embedding to every embedding.
        """
        vectors = [np.array(attr_dict[v.index]) for v in G.vs]
        if vp is None:
            # Fix: average per dimension (axis=0). np.sum without an axis
            # collapsed all vectors into a single scalar, so the "centroid"
            # was not a point of the embedding space.
            g = np.mean(vectors, axis=0)
            return np.sum([np.linalg.norm(vec - g)**2 for vec in vectors])
        anchor = np.array(attr_dict[vp])
        return np.sum([np.linalg.norm(anchor - vec)**2 for vec in vectors])

    def delta_modular(A, K, C_x, C_1, x, m):
        """Difference of x's modularity contribution w.r.t. C_x and C_1.

        NOTE(review): this returns (term for C_x) - (term for C_1), i.e. the
        current community minus the candidate community. A gain for moving x
        into C_1 would normally be the opposite sign -- confirm the intent.
        """
        first_term = np.sum([
            A[v][x] - K[v]*K[x]/(2*m)
            for v in C_x
        ]) / m
        second_term = np.sum([
            A[v][x] - K[v]*K[x]/(2*m)
            for v in C_1
        ]) / m
        return first_term - second_term

    def delta_invertia(C_x, C_1, D, I_V_u, u, denom, I_vs):
        """Difference of u's inertia contribution w.r.t. C_x and C_1.

        NOTE(review): same sign convention as delta_modular (current minus
        candidate) -- confirm the intent.
        """
        first_term = np.sum([
            I_V_u * I_vs[v]/denom - D[u][v]
            for v in C_x
        ]) / (denom/2)
        second_term = np.sum([
            I_V_u * I_vs[v]/denom - D[u][v]
            for v in C_1
        ]) / (denom/2)
        return first_term - second_term

    def semantic_neighbors(G, v, D):
        """Union of v's graph neighbours and vertices with D[v][.] < 0.4."""
        connectivity_neighbors = G.neighbors(v)
        close_vertices = [v2 for v2, distance in D[v].items() if distance < 0.4]
        return list(set(connectivity_neighbors + close_vertices))

    def find_max_gain_comm(v, G, P, A, D, K, denom, I_vs):
        """Return (community, gain) with the largest gain among v's neighbours.

        Falls back to v's current community with gain -1 when no candidate
        exceeds -1. The per-candidate gains are written to 'gains.txt', which
        is overwritten on every call.
        """
        comms = defaultdict(list)
        for v_p, comm in P.items():
            comms[comm].append(v_p)
        neighbors = semantic_neighbors(G, v, D)
        max_gain = -1
        max_gain_comm = P[v]
        I_V_u = I_vs[v]
        m = twod_sum(A)/2
        gains = []
        for neighbor in neighbors:
            # v's own community without v, and the candidate community.
            C_x = comms[P[v]][:]
            C_x.remove(v)
            C_1 = comms[P[neighbor]][:]
            d_modular = delta_modular(A, K, C_x, C_1, v, m)
            d_inertia = delta_invertia(C_x, C_1, D, I_V_u, v, denom, I_vs)
            gains.append((d_modular, d_inertia))
            QQ_gain = d_modular + d_inertia
            if QQ_gain > max_gain:
                max_gain = QQ_gain
                max_gain_comm = P[neighbor]
        with open('gains.txt', 'w') as f:
            print(v, file=f)
            print(gains, file=f)
        return max_gain_comm, max_gain

    def partition(G):
        """Singleton partition: map each vertex index to its position in G.vs."""
        return {v.index: index for index, v in enumerate(G.vs)}

    def distance_matrix(G, attr_dict):
        """Nested dict of squared Euclidean distances between all embeddings."""
        indices = [v.index for v in G.vs]
        vectors = {index: np.asarray(attr_dict[index]) for index in indices}
        D = defaultdict(lambda: defaultdict(float))
        # The distance is symmetric, so each unordered pair is computed once.
        for position, i in enumerate(indices):
            for j in indices[position:]:
                distance = np.linalg.norm(vectors[i] - vectors[j])**2
                D[i][j] = distance
                D[j][i] = distance
        return D

    def calculate_weights(comm1, comm2, A):
        """Sum of A[v1][v2] over all v1 in comm1 and v2 in comm2."""
        total = 0
        for v1 in comm1:
            for v2 in comm2:
                total += A[v1][v2]
        return total

    def fusion_matrix_adjacency(A, comms):
        """Build the community-level graph and its adjacency capped at 1."""
        print("fusion matrix adjacency comms: ", len(comms))

        new_weights = defaultdict(lambda: defaultdict(dict))
        for comm1, vertices1 in comms.items():
            for comm2, vertices2 in comms.items():
                new_weights[comm1][comm2]['weight'] = calculate_weights(vertices1, vertices2, A)
        print("new graph weight shape:", len(new_weights), len(new_weights[0]))
        clustered_G = ig.Graph.DictDict(new_weights)
        clustered_A = map_max(clustered_G.get_adjacency(attribute='weight'), 1)
        print("new graph shape:", clustered_G.vcount())
        return clustered_G, clustered_A

    def fusion_matrix_inertia(D, comms):
        """Community-level distances: sum of D over all cross-community pairs."""
        D_prime = defaultdict(lambda: defaultdict(float))
        print("inertia matrix len: ", len(comms))
        for comm_x, x_vertices in comms.items():
            for comm_y, y_vertices in comms.items():
                D_prime[comm_x][comm_y] = np.sum([
                    D[v_a][v_b]
                    for v_a, v_b in itertools.product(x_vertices, y_vertices)
                ])
                D_prime[comm_y][comm_x] = D_prime[comm_x][comm_y]
        return D_prime

    def recalculate_attr(attr_dict, comms):
        """Embedding of each community: mean of its members' embeddings."""
        return {
            comm: np.mean(np.array([attr_dict[v] for v in vertices]), axis=0)
            for comm, vertices in comms.items()
        }

    def reverse_index(P):
        """Invert P into {renumbered community: [members]}.

        Communities are renumbered 0..k-1 in order of first appearance in P.
        """
        comms = defaultdict(list)
        for v, comm in P.items():
            comms[comm].append(v)
        return {index: vertices for index, vertices in enumerate(comms.values())}

    def map_max(twod_list, max_value):
        """Cap every entry of a 2-D list at `max_value`."""
        return [[min(max_value, x) for x in row] for row in twod_list]

    def twod_sum(twod_list):
        """Sum of all entries of a 2-D list."""
        return sum([sum(row) for row in twod_list])

    ###
    # ilouvain procedure
    ###
    print("calculating partition")
    P = partition(G)
    comms_dict = reverse_index(P)
    A = map_max(G.get_adjacency(attribute='weight'), 1)
    print("calculating distance")
    if D is None:
        D = distance_matrix(G, attr_dict)
    # used for global modularity optimization
    print("create copies")
    ori_P = copy.deepcopy(P)
    levels = []
    while True:
        print("clustering begin")
        print("precalculate invertia")
        I_vs = {v.index: Invertia(G, attr_dict, v.index) for v in G.vs}
        K = weighted_degree(A)
        print("calculate global modularity")
        QQ_anterior = QQ(P, G, A, D, K, attr_dict, I_vs)
        print(QQ_anterior, len(levels), len(G.vs))
        moved = True
        N = G.vcount()
        I_V = Invertia(G, attr_dict)
        denom = 2*N*I_V
        # Local moving phase: sweep over all vertices until a full sweep
        # moves nothing.
        while moved:
            moved = False
            for v in G.vs:
                max_QQ_comm, gain = find_max_gain_comm(v.index, G, P, A, D, K, denom, I_vs)
                if max_QQ_comm != P[v.index] and gain > 0:
                    print("moving node: ", v.index, " from comm: ", P[v.index], " to comm: ", max_QQ_comm, " with gain: ", gain)
                    P[v.index] = max_QQ_comm
                    # Propagate the move to the vertices v stands for.
                    for node in comms_dict[v.index]:
                        ori_P[node] = max_QQ_comm
                    moved = True

            print("one local iteration ends")
        print("local move ends")
        global_QQ = QQ(P, G, A, D, K, attr_dict, I_vs)
        print("new vs. previous: ", global_QQ, QQ_anterior)
        # Aggregation phase: merge each community into one vertex of c_G.
        comms_dict = reverse_index(P)
        c_G, c_A = fusion_matrix_adjacency(A, comms_dict)
        # TODO: figure out what variables are needed for viz at each level
        # preserve the hierarchy
        levels.append((c_G, c_A, P))
        attr_dict = recalculate_attr(attr_dict, comms_dict)
        # construct new distances between clusters
        print("fusion inertia matrix")
        D = fusion_matrix_inertia(D, comms_dict)
        P = partition(c_G)
        # Stop when the graph is small or the pass merged nothing.
        if G.vcount() < 10 or G.vcount() == c_G.vcount():
            break
        print("pass done. ")
        print("calculating new graph inertia")
        G = c_G
        A = c_A
        # NOTE(review): returning here ends the procedure after a single
        # aggregation pass; kept because the caller expects this 3-tuple.
        return G, attr_dict, D
    return ori_P, levels

In [None]:
def distance_matrix(G, attr_dict):
    """Return squared Euclidean distances between all vertex embeddings.

    Args:
        G: object whose `vs` attribute iterates over vertices that expose
            an `index` attribute.
        attr_dict: mapping from vertex index to that vertex's embedding.

    Returns:
        Nested defaultdict `D` with `D[i][j] == D[j][i]` equal to the squared
        Euclidean distance between the embeddings of vertices i and j.

    Raises:
        KeyError: if a vertex index is missing from `attr_dict`.
    """
    indices = [v.index for v in G.vs]
    # Convert every embedding once rather than once per pair.
    vectors = {index: np.asarray(attr_dict[index]) for index in indices}
    D = defaultdict(lambda: defaultdict(float))
    # The distance is symmetric, so each unordered pair is computed only once
    # and stored in both directions.
    for position, i in enumerate(indices):
        for j in indices[position:]:
            distance = np.linalg.norm(vectors[i] - vectors[j])**2
            D[i][j] = distance
            D[j][i] = distance
    return D

In [25]:
# Precompute the pairwise squared embedding distances once so they can be
# passed to the clustering functions below instead of being recomputed.
D = distance_matrix(G_ccs, attr_dict)

In [None]:
# CG, levels = ilouvain(G_ccs, attr_dict, D)
# P, levels = ilouvain(G_ccs, attr_dict, D)
# Unpacks the three-value return of ilouvain (graph, attributes, distances).
# NOTE(review): ilouvain also has a two-value return path (`ori_P, levels`);
# this unpacking fails if that path is taken.
new_G, new_attr_dict, new_D = ilouvain(G_ccs, attr_dict, D)

In [26]:
def ravasz(G, attr_dict, D=None):
    """Agglomerative clustering that merges each vertex with its most similar vertex.

    In every pass a similarity score (mean of a weighted topological-overlap
    term and `1 - D[i][j]`) is computed for all vertex pairs, each vertex is
    assigned the community of its highest-scoring partner, and the resulting
    communities are fused into the vertices of a coarser graph. Passes repeat
    until the graph has fewer than 10 vertices or a pass merges nothing.

    Args:
        G: graph exposing `vs`, `vcount()`, `neighbors()` and
            `get_adjacency(attribute='weight')`.
        attr_dict: mapping from vertex index to that vertex's embedding.
        D: optional nested mapping of pairwise distances; computed from
            `attr_dict` when omitted.

    Returns:
        `levels`: a defaultdict mapping each vertex index of the input graph
        to a list of community labels, one slot per pass. One `None` slot is
        appended per pass and the slot at index `level` is the one written.
    """
    def generate_node_pair(arr):
        # All ordered pairs of vertex indices, self-pairs included.
        node_pairs = []
        for v1 in arr:
            for v2 in arr:
                node_pairs.append((v1.index, v2.index))
        return node_pairs

    def weighted_degree(A):
        # Row sums of A, keyed by row index.
        return {v: sum(A[v]) for v in range(0, len(A))}
    
    def weighted_common_neighbors(G, A, i, j):
        # Total weight from i and from j towards the neighbours they share.
        i_neighbors = G.neighbors(i)
        j_neigobors = G.neighbors(j)
        common_neighbors = list(set(i_neighbors).intersection(set(j_neigobors)))
        return sum([A[i][v] for v in common_neighbors]) + sum([A[j][v] for v in common_neighbors])


    def weighted_TO(G, A, K, i, j):
        # Topological overlap: J / (min(K_i, K_j) + 1 - A_ij).
        J = weighted_common_neighbors(G, A, i, j)
        return J/ (min(K[i], K[j]) + 1 - A[i][j])
    
    def distance_matrix(G, attr_dict):
        # Squared Euclidean distance between every pair of embeddings.
        def dist(vec1, vec2):
            return np.linalg.norm(np.array(vec1) - np.array(vec2))**2
        # create a list of list of distances
        D = defaultdict(lambda: defaultdict(float))
        for index1, v1 in enumerate(G.vs):
            for index2, v2 in enumerate(G.vs):
                v1_index = v1.index
                v2_index = v2.index
                embedding1 = attr_dict[v1_index]
                embedding2 = attr_dict[v2_index]
                distance = dist(embedding1, embedding2)
                D[v1_index][v2_index] = distance
                D[v2_index][v1_index] = distance
        return D

    def map_max(twod_list, max_value):
        # Cap every entry of a 2-D list at max_value.
        return [[min(max_value, x) for x in row] for row in twod_list]

    def twod_sum(twod_list):
        # NOTE(review): not called anywhere inside ravasz.
        return sum([sum(row) for row in twod_list])

    def partition(G):
        # Singleton partition: vertex index -> position in G.vs.
        P = {}
        for index, v in enumerate(G.vs):
            P[v.index] = index
        return P 

    def similarity(G, A, K, D, P):
        # Pairwise similarity; P is accepted but not read here.
        S = defaultdict(lambda: defaultdict(float))
        list_node_pairs = generate_node_pair(G.vs)
        for i, j in list_node_pairs:
            if i == j: 
                # A vertex must never be selected as its own best match.
                S[i][j] = -math.inf
                continue
            connectivity_similarity = weighted_TO(G, A, K, i, j)
            # NOTE(review): D holds squared distances, so this term can be
            # negative when D[i][j] > 1 -- confirm this is intended.
            semantic_similarity = 1 - D[i][j]
            S[i][j] = (connectivity_similarity + semantic_similarity) /2
        return S

    def reverse_index(P):
        # Invert P into {renumbered community: [members]}; communities are
        # renumbered 0..k-1 in order of first appearance in P.
        comms = defaultdict(list)
        for v, comm in P.items():
            comms[comm].append(v)
        renumber_dict = {}
        for index, comm in enumerate(list(comms.keys())):
            renumber_dict[comm] = index
        renumbered_comms_dict = {
            renumber_dict[comm]: vertices for comm, vertices in comms.items()
        }
        return renumbered_comms_dict

    def calculate_weights(comm1, comm2, A):
        # Sum of A over all cross pairs of the two member lists.
        total = 0
        for v1 in comm1:
            for v2 in comm2:
                total += A[v1][v2]
        return total

    def fusion_matrix_adjacency(A, comms):
        # Build the community-level graph and its adjacency capped at 1.
        print("fusion matrix adjacency comms: ", len(comms))

        new_weights = defaultdict(lambda: defaultdict(dict))
        for comm1, vertices1 in comms.items():
            for comm2, vertices2 in comms.items():
                new_weights[comm1][comm2]['weight'] = calculate_weights(vertices1, vertices2, A)
                # weights[comm2][comm1]['weight'] = weights[comm1][comm2]['weight']
        clustered_G = ig.Graph.DictDict(new_weights)
        clustered_A = map_max(clustered_G.get_adjacency(attribute='weight'), 1)
        return clustered_G, clustered_A
    
    def recalculate_attr(attr_dict, comms):
        # Community embedding = mean of its members' embeddings.
        new_attr_dict = {}
        for comm, vertices in comms.items():
            avg_attr = np.mean(np.array([attr_dict[v] for v in vertices]), axis=0)
            new_attr_dict[comm] = avg_attr
        return new_attr_dict


    # NOTE(review): this list is replaced by the defaultdict a few lines below.
    levels = []
    P = partition(G)
    comms_dict = reverse_index(P)
    # NOTE(review): this is an alias, not a copy -- until P is rebound at the
    # end of the first pass, writes to ori_graph_partition also change P.
    ori_graph_partition = P
    levels = defaultdict(list)
    level = 0
    # init levels
    for v in G.vs:
        levels[v.index].append(P[v.index])
    # Adjacency entries are capped at 1 before any degree/overlap computation.
    A = map_max(G.get_adjacency(attribute='weight'), 1)
    if D is None:
        D = distance_matrix(G, attr_dict)
    while(True):
        # init level slot
        for v, cur_levels in levels.items():
            cur_levels.append(None)
        print("clustering begin")
        print("initial nodes:", G.vcount())
        K = weighted_degree(A)
        similarity_matrix = similarity(G, A, K, D, P)
        # Members of each current community, in terms of input-graph vertices.
        # Computed once per pass, i.e. not refreshed while P changes below.
        ori_graph_comms_dict = reverse_index(ori_graph_partition)
        for v in G.vs:
            # Partner with the highest similarity score for this vertex.
            most_similar_node = max(similarity_matrix[v.index].items(), key=operator.itemgetter(1))[0]
            print("moving node: ", v.index, " from comm: ", P[v.index], " to comm: ", P[most_similar_node])
            # merge v into most_similar_node in G
            for node in ori_graph_comms_dict[P[v.index]]:
                ori_graph_partition[node] = P[most_similar_node]
                levels[node][level] = P[most_similar_node]
            for node in ori_graph_comms_dict[P[most_similar_node]]:
                ori_graph_partition[node] = P[most_similar_node]
                levels[node][level] = P[most_similar_node]
            # rewrite at G'
            P[v.index] = P[most_similar_node]

        level += 1
        print("one iteration done")
        comms_dict = reverse_index(P)
        print("total nodes in comms:", sum([len(x) for x in ori_graph_comms_dict.values()]))
        c_G, c_A  = fusion_matrix_adjacency(A, comms_dict)
        print("clusters: ", c_G.vcount())
        # TODO: figure out what variables are needed for viz at each level
        # preserve the hierarchy
        attr_dict = recalculate_attr(attr_dict, comms_dict)
        # construct new distances between clusters
        D = distance_matrix(c_G, attr_dict)
        P = partition(c_G)
        # assign the result to operate recursively
        # Stop when the graph is small or the pass merged nothing.
        if G.vcount() < 10 or G.vcount() == c_G.vcount(): break
        print("pass done. ")
        G = c_G
        A = c_A
    return levels


In [27]:
# levels = ravasz(G_ccs, attr_dict, D)
# Run the clustering on G_ccs, reusing the precomputed distance matrix D.
levels = ravasz(G_ccs, attr_dict, D)

clustering begin
initial nodes: 568
moving node:  0  from comm:  0  to comm:  17
moving node:  1  from comm:  1  to comm:  65
moving node:  2  from comm:  2  to comm:  42
moving node:  3  from comm:  3  to comm:  17
moving node:  4  from comm:  4  to comm:  11
moving node:  5  from comm:  5  to comm:  64
moving node:  6  from comm:  6  to comm:  64
moving node:  7  from comm:  7  to comm:  42
moving node:  8  from comm:  8  to comm:  128
moving node:  9  from comm:  9  to comm:  17
moving node:  10  from comm:  10  to comm:  11
moving node:  11  from comm:  11  to comm:  11
moving node:  12  from comm:  12  to comm:  11
moving node:  13  from comm:  13  to comm:  349
moving node:  14  from comm:  14  to comm:  17
moving node:  15  from comm:  15  to comm:  349
moving node:  16  from comm:  16  to comm:  17
moving node:  17  from comm:  17  to comm:  17
moving node:  18  from comm:  18  to comm:  75
moving node:  19  from comm:  19  to comm:  64
moving node:  20  from comm:  20  to comm

In [38]:
def _renumber_dict(P):
    comm_set = set(P.values())
    renumber_dict = {comm: index for index, comm in enumerate(comm_set)}
    return renumber_dict
    # P = {v: renumber_dict[comm] for v, comm in P.items()}
    # return P
    

def levels_to_partitions(G, levels):
    """Turn per-vertex level lists into one renumbered partition per level.

    The last slot of every vertex's level list is dropped first. For each
    remaining level a `{vertex name: community}` dict is built, its labels
    are compacted with `_renumber_dict`, and the compact labels are written
    back into `levels`. If the final partition still has more than one
    community, an extra all-zero partition is appended as a single root.

    Note: `levels` is modified in place and also returned.

    Returns:
        (partitions, levels)
    """
    # Drop the trailing slot of each vertex's list.
    for vertex in G.vs:
        levels[vertex.index] = levels[vertex.index][:-1]

    partitions = []
    depth_count = len(levels[0])
    for depth in range(depth_count):
        raw_labels = {vertex['name']: levels[vertex.index][depth] for vertex in G.vs}
        compact_of = _renumber_dict(raw_labels)
        compact_labels = {name: compact_of[label] for name, label in raw_labels.items()}
        for vertex in G.vs:
            levels[vertex.index][depth] = compact_labels[vertex['name']]
        partitions.append(compact_labels)

    # Add a single root community when the top level is not already one.
    if len(set(partitions[-1].values())) > 1:
        partitions.append({vertex['name']: 0 for vertex in G.vs})
        for vertex in G.vs:
            levels[vertex.index].append(0)
    return partitions, levels
# A deep copy is passed because levels_to_partitions rewrites its `levels`
# argument in place; the original `levels` stays untouched.
partitions, renumbered_levels = levels_to_partitions(G_ccs, copy.deepcopy(levels))

In [39]:
def get_level_transition(levels):
    """Build a nested tree of communities from per-vertex level lists.

    Every community becomes a dict with "title" and "key" set to
    "L-<level>-<label>"; a community that has children also carries a
    'children' list. A child is attached to its parent at most once.

    Args:
        levels: mapping from vertex to its list of community labels, one per
            level; the number of levels is read from `levels[0]`.

    Returns:
        The node titled "L-<last level>-0".
    """
    def title_of(depth, label):
        return "L-{}-{}".format(depth, label)

    nested_comms = {}
    top_level = len(levels[0]) - 1
    for depth in range(top_level):
        for transitions in levels.values():
            child_title = title_of(depth, transitions[depth])
            parent_title = title_of(depth + 1, transitions[depth + 1])

            # First time this community is seen: register it as a leaf.
            if child_title not in nested_comms:
                nested_comms[child_title] = {
                    "title": child_title,
                    "key": child_title,
                }
            child_node = nested_comms[child_title]

            parent_node = nested_comms.get(parent_title)
            if parent_node is None:
                nested_comms[parent_title] = {
                    "title": parent_title,
                    "key": parent_title,
                    'children': [child_node],
                }
            elif all(existing['title'] != child_title for existing in parent_node['children']):
                # Attach only if not already listed under this parent.
                parent_node['children'].append(child_node)
    return nested_comms[title_of(top_level, 0)]
# Spot-check one vertex before and after renumbering, then build the nested
# community tree from the renumbered levels.
print(levels[500])
print(renumbered_levels[500])
hierarchies = get_level_transition(renumbered_levels)

[500, 167, 5, None]
[164, 0, 1, 0]


In [40]:
def save_json(data, filepath=r'new_data.json'):
    """Write `data` to `filepath` as JSON indented by four spaces.

    An existing file at `filepath` is overwritten.
    """
    with open(filepath, 'w') as out_file:
        json.dump(data, out_file, indent=4)
# Show the second partition for inspection, then persist all partitions.
print(partitions[1])
save_json(partitions, "data/result/RAMS/gpt_biHgraph_dev/ravasz_partitions.json")

{'108-Conflict-Crash-Death': 2, '625-Procure-Sell': 1, '772-Cooperate-Prevent-Report': 0, '550-Escalate-Proxy war-React': 2, '522-Fight-Provide-Suggest': 0, '678-Allow-Confirm-Prevent': 3, '803-Attack-Include': 3, '469-Carry out-Involve-Result in': 0, '864-Involve-Involvement': 1, '383-Believe-Effects-Lead to': 2, '26-Critcize-Suggest': 0, '355-Opinion-Require-Suggestion': 0, '563-Criticism-Decrease-Suggest': 0, '680-Implications-Involve-Lead to': 0, '163-Acquire-Ally-Bring down-Confiscate-Control': 2, '430-Declare-Halt-Respond': 0, '543-Escalate-Fight-Occur-Sponsor': 2, '349-Criticize-Execute': 2, '242-Announce-Attempt-Negotiate': 1, '786-Hope-Place-Sanction': 3, '391-phone call-presence-statement': 3, '154-Advocate-Challenge': 1, '321-Airstrike-Intensify-Target': 0, '217-Host-Need-Pledge': 3, '823-Expressed-Refused-Urged': 3, '620-Part of-Spread-Use': 1, '896-Meet with displeasure-Reaffirm-Rebuke': 0, '118-Campaign-Include-Result': 1, '741-End-Opinion': 1, '704-Believe-Deploy': 1, '9

In [31]:
# Bare expression: has no effect on its own because it is not the last
# statement of the cell.
hierarchies
# Persist the nested community tree.
save_json(hierarchies, "data/result/RAMS/gpt_biHgraph_dev/ravasz_hierarchies.json")

In [None]:
def map_max(twod_list, max_value):
    """Return a copy of a 2-D list with every entry capped at `max_value`."""
    capped_rows = []
    for row in twod_list:
        capped_row = []
        for entry in row:
            capped_row.append(min(max_value, entry))
        capped_rows.append(capped_row)
    return capped_rows

def weighted_degree(A):
    """Return a dict mapping each row index of A to the sum of that row."""
    degrees = {}
    for node in range(len(A)):
        degrees[node] = sum(A[node])
    return degrees

def partition(G):
    """Return the singleton partition of G.

    Each vertex index is mapped to the position of that vertex in `G.vs`,
    so every vertex starts in a community of its own.
    """
    return {vertex.index: position for position, vertex in enumerate(G.vs)}

def similarity(G, A, K, D, P):
    """Return pairwise similarity scores for all vertex pairs of G.

    For i != j the score is the mean of `weighted_TO(G, A, K, i, j)` and
    `1 - D[i][j]`. The score of a vertex with itself is negative infinity.
    `P` is accepted but not read.

    Returns:
        Nested defaultdict `S` with `S[i][j]` holding the score.
    """
    scores = defaultdict(lambda: defaultdict(float))
    for i, j in generate_node_pair(G.vs):
        if i != j:
            topological = weighted_TO(G, A, K, i, j)
            semantic = 1 - D[i][j]
            scores[i][j] = (topological + semantic) / 2
        else:
            scores[i][j] = -math.inf
    return scores

def generate_node_pair(arr):
    """Return all ordered index pairs of the vertices in `arr`.

    Self-pairs are included and both orders of every pair are produced.
    """
    return [(first.index, second.index) for first in arr for second in arr]

def weighted_degree(A):
    """Return {row index: row sum} for the 2-D structure A.

    This repeats a definition that appears earlier in the same cell.
    """
    return {node: sum(A[node]) for node in range(len(A))}

def weighted_common_neighbors(G, A, i, j):
    """Return the total weight linking i and j to their shared neighbours.

    The result is the sum of `A[i][n]` plus the sum of `A[j][n]` over every
    vertex n returned by both `G.neighbors(i)` and `G.neighbors(j)`.
    """
    shared = set(G.neighbors(i)).intersection(set(G.neighbors(j)))
    weight_from_i = sum(A[i][n] for n in shared)
    weight_from_j = sum(A[j][n] for n in shared)
    return weight_from_i + weight_from_j


def weighted_TO(G, A, K, i, j):
    """Return the weighted topological overlap of vertices i and j.

    Computed as J / (min(K[i], K[j]) + 1 - A[i][j]), where J is the value of
    `weighted_common_neighbors(G, A, i, j)`.
    """
    shared_weight = weighted_common_neighbors(G, A, i, j)
    smaller_degree = min(K[i], K[j])
    return shared_weight / (smaller_degree + 1 - A[i][j])

# Stand-alone run of the similarity computation on G_ccs using the
# module-level helpers defined in this cell.
G = G_ccs
# Adjacency with every entry capped at 1.
A = map_max(G.get_adjacency(attribute='weight'), 1)
K = weighted_degree(A)
# No-op self-assignment: the previously computed D is reused as-is.
D = D
P = partition(G)
similarity_matrix = similarity(G, A, K, D, P)

In [None]:
def distance_matrix(G, attr_dict):
    """Compute squared Euclidean distances between all vertex embeddings.

    Args:
        G: object whose `vs` attribute iterates over vertices that expose
            an `index` attribute.
        attr_dict: mapping from vertex index to that vertex's embedding.

    Returns:
        Nested defaultdict `D` where `D[i][j]` and `D[j][i]` both hold the
        squared Euclidean distance between the embeddings of i and j.

    Raises:
        KeyError: if a vertex index has no entry in `attr_dict`.
    """
    vertex_ids = [v.index for v in G.vs]
    # Each embedding is converted to an array once, not once per pair.
    embeddings = {vid: np.asarray(attr_dict[vid]) for vid in vertex_ids}
    D = defaultdict(lambda: defaultdict(float))
    # Only the upper triangle (diagonal included) is computed; the value is
    # mirrored because the distance is symmetric.
    for offset, first in enumerate(vertex_ids):
        for second in vertex_ids[offset:]:
            distance = np.linalg.norm(embeddings[first] - embeddings[second])**2
            D[first][second] = distance
            D[second][first] = distance
    return D
# Recompute the distances on the coarsened graph and attributes returned by
# the ilouvain call, presumably to compare against new_D -- TODO confirm.
test_D = distance_matrix(new_G, new_attr_dict)
