## Clustering on the event hyper graph
### Node similarity measure:
- topological overlap + event/sentence embeddings
### Group similarity measure:
- average similarity of node pairs between cluster pairs
### Procedure:
- convert hypergraph to weighted normal graph (or don't?)
- Assign each node to its own cluster and evaluate similarity measure for all node pairs
- merge node pairs with highest similarity measure into the same community 
- - how many pairs to merge?
- repeat by merging clusters in the same way until no merge is available
### dual of hypergraph
- link clustering can be achieved by doing clustering on the dual of a hypergraph

In [3]:
import networkx as nx
import hypernetx as hnx
import numpy as np
from scipy import spatial
from scipy.sparse import csr_matrix

import json
import hypernetx.algorithms.hypergraph_modularity as hmod
import igraph as ig
from collections import defaultdict
from itertools import combinations

import itertools
import copy
import time 
import sys
import operator
import math

 No module named 'celluloid'. If you need to use hypernetx.algorithms.contagion, please install additional packages by running the following command: pip install .['all']


In [None]:
# read network
# RAMS
# B = nx.node_link_graph(json.load(open('data/result/RAMS/gpt_biHgraph_dev/hgraph.json')))
# AllTheNews
B = nx.node_link_graph(json.load(open('data/result/AllTheNews/network/hgraph.json')))

In [51]:
H = hnx.Hypergraph.from_bipartite(B)
H.shape

(58582, 7542)

In [67]:
singletons = [node for node in H.nodes if H.degree(node) == 1]
print(len(singletons))

50781


In [68]:
cleaned_H = H.remove_nodes(singletons)

In [69]:
print(H.shape)
print(cleaned_H.shape)

(58582, 7542)
(7801, 7372)


## reduce hypergraph to two-section graph with edge reweighting proposed in [1]


[1] Kumar T., Vaidyanathan S., Ananthapadmanabhan H., Parthasarathy S. and Ravindran B. “A New Measure of Modularity in Hypergraphs: Theoretical Insights and Implications for Effective Clustering”. In: Cherifi H., Gaito S., Mendes J., Moro E., Rocha L. (eds) Complex Networks and Their Applications VIII. COMPLEX NETWORKS 2019. Studies in Computational Intelligence, vol 881. Springer, Cham

In [70]:
# hyperedge_dict = json.load(open('data/result/RAMS/gpt_biHgraph_dev/hyperedges_w_embeddings.json'))
embeddings = json.load(open('data/raw/AllTheNews/embeddings/2016_10p.json'))
hyperedge_dict = json.load(open('data/result/AllTheNews/network/hyperedges.json'))

In [72]:
# clustering on hyperedges
dual_H = cleaned_H.dual()
print(dual_H.shape)

(7372, 7801)


In [73]:
component_subgraphs = dual_H.s_component_subgraphs(edges=False, return_singletons=True)
G_ccs = ig.Graph()
weights = defaultdict(lambda: defaultdict(dict))
total = 0
event_set = set()
total_edges = 0
print("finding connected components...")
for s_component in component_subgraphs:
    print("total: ", total)
    total += s_component.shape[0]
    if s_component.shape[0] == 1:
        event = list(s_component.nodes())[0]
        event_name = "-".join(v['name'].split('-')[1:])
        if event_name not in event_set:
            G_ccs.add_vertices(list(s_component.nodes()))
        continue
    print("component_size: ", s_component.shape[0])
    print("reweighting...")
    cc = hmod.two_section(s_component)
    print("two_section graph size:", cc.vcount())
    index2id_dict = {}
    for v in cc.vs:
        index2id_dict[v.index] = v['name']

    deleted_vertices = []
    for v in cc.vs:
        event_name = "-".join(v['name'].split('-')[1:])
        if event_name in event_set:
            deleted_vertices.append(v['name'])
        event_set.add(event_name)

    # cc.delete_vertices(deleted_vertices)

    deleted_edges = []
    for e in cc.es:
        if index2id_dict[e.source] in deleted_vertices or index2id_dict[e.target] in deleted_vertices:
            deleted_edges.append((e.source, e.target))
    cc.delete_edges(deleted_edges)

    # edges = [(e.source, e.target, e['weight']) for e in cc.es]
    # G_ccs.add_vertices([v['name'] for v in cc.vs])
    total_edges += len(cc.es)

    # G_ccs.add_edges([(index2id_dict[e.source], index2id_dict[e.target]) for e in cc.es])
    for v in cc.vs:
        weights[v['name']][v['name']]['weight'] = 0
    if len(cc.es) != 0:
        for e in cc.es:
            weights[cc.vs[e.source]['name']][cc.vs[e.target]['name']]['weight'] = e['weight']
            weights[cc.vs[e.target]['name']][cc.vs[e.source]['name']]['weight'] = e['weight']
    # if total >= 4000:
    #     break

# print([G_cc.vcount() for G_cc in G_ccs])
# print(G_ccs.vcount())
print(total_edges, len(weights))
# GU = ig.union(G_ccs)

finding connected components...
getting s_components
getting linegraph
getting connected components
total:  0
component_size:  7370
reweighting...
two_section graph size: 7370
total:  7370
component_size:  2
reweighting...
two_section graph size: 2
1499122 7372


In [74]:
G_ccs = ig.Graph.DictDict(weights)
print(G_ccs.vcount(), dual_H.shape)

7372 (7372, 7801)


In [8]:
missing_nodes = 0
all_event_nodes = [node for node in dual_H.nodes()]
largest_cc_nodes = [v['name'] for v in G_ccs.vs]
for event_name in all_event_nodes:
    if event_name not in largest_cc_nodes:
        G_ccs.add_vertex(event_name)

In [75]:
id2index_dict = {}
for v in G_ccs.vs:
    id2index_dict[v['name']] = v.index

In [76]:
A = G_ccs.get_adjacency(attribute='weight')

In [77]:
# hyperedge_embeddings = {hyperedge['id']: embedding_vec2dict(hyperedge['embedding']) for hyperedge in hyperedge_dict.values()}
embeddings_dict = {embedding['id']: embedding for embedding in embeddings}
hyperedge_embeddings = {hyperedge_id: embeddings_dict[hyperedge_data['doc_id']]['embedding'] for hyperedge_id, hyperedge_data in hyperedge_dict.items()}
attr_dict = {v.index: hyperedge_embeddings[v['name']] for v in G_ccs.vs}

In [None]:
def ilouvain(G, attr_dict, D=None):
    """
    Modified version of the Louvain algorithm that takes embeddings into account
    """
    def generate_node_pair(arr):
        node_pairs = []
        for v1 in arr:
            for v2 in arr:
                node_pairs.append((v1, v2))
        return node_pairs

    def kro(c1, c2):
        return 1 if c1 == c2 else 0
    
    def weighted_degree(A):
        return {v: sum(A[v]) for v in range(0, len(A))}

    def QQ(P, G, A, D, K, attr_dict, I_vs):
        m = twod_sum(A)/2
        N = G.vcount()
        list_node_pairs = generate_node_pair([v.index for v in G.vs])
        print("node_pairs: ", len(list_node_pairs))
        I_V = Invertia(G, attr_dict)
        I_denominator_1 = (2*N*I_V)
        I_denominator_2 = (I_denominator_1)**2
        QQ_list = [
            (0,0) if kro(P[v1], P[v2]) == 0 else
            # Q_NG
            ((A[v1][v2] - K[v1]*K[v2])/(2*m)
            ,
            # Q_Invertia
            (I_vs[v1] * I_vs[v2]/(I_denominator_1) 
            - D[v1][v2]/(I_denominator_2)
            ))
            for v1, v2 in list_node_pairs
        ]
        original_stdout = sys.stdout # Save a reference to the original standard output
        with open('QQ_list.txt', 'w') as f:
            sys.stdout = f # Change the standard output to the file we created.
            print(QQ_list)
            sys.stdout = original_stdout # Reset the standard output to its 

        return np.sum([QQ_value[0] + QQ_value[1] for QQ_value in QQ_list])

        return np.sum([
            0 if kro(P[v1], P[v2]) == 0 else
            # Q_NG
            (2*m*min(A[v1][v2], 1) - G.degree(v1)*G.degree(v2))
            +
            # Q_Invertia
            (I_vs[v1] * I_vs[v2]/(I_denominator_1) 
            - D[v1][v2]/(I_denominator_2)
            )
            for v1, v2 in list_node_pairs
        ])
        
        # return Q_NG(P, G, A) + Q_Invertia(P, G, D, attr_dict)

    # def Q_NG(P, G, A):
    #     m = G.ecount()
    #     list_node_pairs = generate_node_pair([v.index for v in G.vs])
    #     return np.sum([(A[v1][v2] - G.degree(v1)*G.degree(v2)/(2*m)) * kro(P[v1], P[v2]) for v1, v2 in list_node_pairs])

    # def Q_Invertia(P, G, D, attr_dict):
    #     list_node_pairs = generate_node_pair([v.index for v in G.vs])
    #     N = len(G.vs)
    #     return np.sum([
    #         (Invertia(G, attr_dict, v1) * Invertia(G, attr_dict, v2)/((2*N*Invertia(G, attr_dict))**2) 
    #         - D[v1][v2]/(2*N*Invertia(G, attr_dict))
    #         ) * kro(P[v1], P[v2])
    #         for v1, v2 in list_node_pairs
    #     ])
    
    def Invertia(G, attr_dict, vp=None):
        N = G.vcount()
        if vp is None:
            g = np.sum([np.array(attr_dict[v.index]) for v in G.vs]) / N
            I = np.sum([np.linalg.norm(np.array(attr_dict[v.index])-g)**2 for v in G.vs])
            return I
        else:
            return np.sum([np.linalg.norm(np.array(attr_dict[vp]) - np.array(attr_dict[v.index]))**2 for v in G.vs]) 

    def delta_modular(A, K, C_x, C_1, x, m):
        # print(len(A), len(B))
        first_term = np.sum([
            A[v][x] - K[v]*K[x]/(2*m)
            for v in C_x
        ]) / m
        second_term = np.sum([
            A[v][x] - K[v]*K[x]/(2*m)
            for v in C_1
        ]) / m
        # print("delta: ", first_term, second_term)
        return first_term - second_term

    def delta_invertia(C_x, C_1, D, I_V_u, u, denom, I_vs):
        # print(len(A), len(B))
        first_term = np.sum([
            I_V_u * I_vs[v]/denom - D[u][v]
            for v in C_x
        ]) / (denom/2) 
        second_term = np.sum([
            I_V_u * I_vs[v]/denom - D[u][v]
            for v in C_1
        ]) / (denom/2)
        # print("delta: ", first_term, second_term)
        return first_term - second_term


    def semantic_neighbors(G, v, D):
        connectivity_neighbors = G.neighbors(v)
        semantic_neighbors = []
        for v2, distance in D[v].items():
            if distance < 0.4:
                semantic_neighbors.append(v2)
        return list(set(connectivity_neighbors + semantic_neighbors))

    def find_max_gain_comm(v, G, P, A, D, K, denom, I_vs):
        comms = defaultdict(list)
        for v_p, comm in P.items():
            comms[comm].append(v_p)
        neighbors = semantic_neighbors(G, v, D)
        max_gain = -1
        max_gain_comm = P[v]
        I_V_u = I_vs[v]
        m = twod_sum(A)/2
        gains = []
        for neighbor in neighbors:
            # new_QQ = QQ(P, G, A, D, attr_dict)
            C_x = comms[P[v]][:]
            C_x.remove(v)
            C_1 = comms[P[neighbor]][:]
            # C_1.append(v)
            # neighbor_start = time.process_time()
            d_modular = delta_modular(A, K, C_x, C_1, v, m)
            d_inertia = delta_invertia(C_x, C_1, D, I_V_u, v, denom, I_vs)
            gains.append((d_modular, d_inertia))
            QQ_gain = d_modular + d_inertia
            # print(invertia_gain, max_gain)
            # neighbor_duration = time.process_time() - neighbor_start
            # print("neighbor_duration: ", neighbor_duration)
            if QQ_gain > max_gain:
                max_gain = QQ_gain
                max_gain_comm = P[neighbor]
        original_stdout = sys.stdout # Save a reference to the original standard output
        with open('gains.txt', 'w') as f:
            sys.stdout = f # Change the standard output to the file we created.
            print(v)
            print(gains)
            sys.stdout = original_stdout # Reset the standard output to its 
        return max_gain_comm, max_gain

    def partition(G):
        P = {}
        for index, v in enumerate(G.vs):
            P[v.index] = index
        return P 

    def distance_matrix(G, attr_dict):
        def dist(vec1, vec2):
            return np.linalg.norm(np.array(vec1) - np.array(vec2))**2
        # create a list of list of distances
        D = defaultdict(lambda: defaultdict(float))
        for index1, v1 in enumerate(G.vs):
            for index2, v2 in enumerate(G.vs):
                v1_index = v1.index
                v2_index = v2.index
                embedding1 = attr_dict[v1_index]
                embedding2 = attr_dict[v2_index]
                distance = dist(embedding1, embedding2)
                D[v1_index][v2_index] = distance
                D[v2_index][v1_index] = distance
        return D
    
    def calculate_weights(comm1, comm2, A):
        total = 0
        for v1 in comm1:
            for v2 in comm2:
                total += A[v1][v2]
        return total


    def fusion_matrix_adjacency(A, comms):
        print("fusion matrix adjacency comms: ", len(comms))

        new_weights = defaultdict(lambda: defaultdict(dict))
        for comm1, vertices1 in comms.items():
            for comm2, vertices2 in comms.items():
                new_weights[comm1][comm2]['weight'] = calculate_weights(vertices1, vertices2, A)
                # weights[comm2][comm1]['weight'] = weights[comm1][comm2]['weight']
        print("new graph weight shape:", len(new_weights), len(new_weights[0]))
        clustered_G = ig.Graph.DictDict(new_weights)
        clustered_A = map_max(clustered_G.get_adjacency(attribute='weight'), 1)
        print("new graph shape:", clustered_G.vcount())
        return clustered_G, clustered_A

    def fusion_matrix_inertia(D, comms):
        D_prime = defaultdict(lambda: defaultdict(float))
        print("inertia matrix len: ", len(comms))
        for comm_x, x_vertices in comms.items():
            for comm_y, y_vertices in comms.items():
                x_to_y_pairs = list(itertools.product(x_vertices, y_vertices))
                D_prime[comm_x][comm_y] = np.sum([
                    D[v_a][v_b]
                    for v_a, v_b in  x_to_y_pairs
                ])
                D_prime[comm_y][comm_x] = D_prime[comm_x][comm_y]
        return D_prime
    
    def recalculate_attr(attr_dict, comms):
        new_attr_dict = {}
        for comm, vertices in comms.items():
            avg_attr = np.mean(np.array([attr_dict[v] for v in vertices]), axis=0)
            new_attr_dict[comm] = avg_attr
        return new_attr_dict
    
    def reverse_index(P):
        comms = defaultdict(list)
        for v, comm in P.items():
            comms[comm].append(v)
        renumber_dict = {}
        for index, comm in enumerate(list(comms.keys())):
            renumber_dict[comm] = index
        renumbered_comms_dict = {
            renumber_dict[comm]: vertices for comm, vertices in comms.items()
        }
        return renumbered_comms_dict

    def map_max(twod_list, max_value):
        return [[min(max_value, x) for x in row] for row in twod_list]
    def twod_sum(twod_list):
        return sum([sum(row) for row in twod_list])

    ###
    # ilouvain procedure
    ###
    print("calculating partition")
    P = partition(G)
    comms_dict = reverse_index(P)
    A = map_max(G.get_adjacency(attribute='weight'), 1)
    print("calculating distance")
    if D is None:
        D = distance_matrix(G, attr_dict)
    # used for global modularity optimization
    print("create copies")
    ori_P = copy.deepcopy(P)
    # ori_G = copy.deepcopy(G)
    # ori_A = copy.deepcopy(A)
    # ori_D = copy.deepcopy(D)
    # ori_attr_dict = copy.deepcopy(attr_dict)
    # ori_I_vs = copy.deepcopy(I_vs)
    levels = []
    while(True):
        print("clustering begin")
        # QQ_anterior = -1000
        print("precalculate invertia")
        I_vs = {v.index: Invertia(G, attr_dict, v.index) for v in G.vs}
        K = weighted_degree(A)
        print("calculate global modularity")
        QQ_anterior = QQ(P, G, A, D, K, attr_dict, I_vs)
        print(QQ_anterior, len(levels), len(G.vs))
        moved = True
        N = G.vcount()
        I_V = Invertia(G, attr_dict)
        denom = 2*N*I_V
        while(moved):
            moved = False
            moves = {}
            for v in G.vs:
                max_QQ_comm, gain = find_max_gain_comm(v.index, G, P, A, D, K, denom, I_vs)
                # print(P[v.index], max_QQ_comm, gain)
                if max_QQ_comm != P[v.index] and gain > 0:
                    print("moving node: ", v.index, " from comm: ", P[v.index], " to comm: ", max_QQ_comm, " with gain: ", gain)
                    P[v.index] = max_QQ_comm
                    # moves[v.index] = max_QQ_comm
                    for node in comms_dict[v.index]:
                        ori_P[node] = max_QQ_comm
                    moved = True
            # for v, comm in moves.items():
            #     P[v] = comm
                
            print("one local iteration ends")
        print("local move ends")
        # global_QQ = QQ(ori_P, ori_G, ori_A, ori_D, ori_attr_dict, ori_I_vs)
        global_QQ = QQ(P, G, A, D, K, attr_dict, I_vs)
        print("new vs. previous: ", global_QQ, QQ_anterior)
        # if global_QQ > QQ_anterior:
            # merge each cluster into a node in c_G
        comms_dict = reverse_index(P)
        c_G, c_A  = fusion_matrix_adjacency(A, comms_dict)
        # TODO: figure out what variables are needed for viz at each level
        # preserve the hierarchy
        levels.append((c_G, c_A, P))
        attr_dict = recalculate_attr(attr_dict, comms_dict)
        # construct new distances between clusters
        print("fusion inertia matrix")
        D = fusion_matrix_inertia(D, comms_dict)
        P = partition(c_G)
        # assign the result to operate recursively
        if G.vcount() < 10 or G.vcount() == c_G.vcount(): break
        print("pass done. ")
        print("calculating new graph inertia")
        G = c_G
        A = c_A
        return G, attr_dict, D
        # return ori_P, levels

        # else:
        #     break
    return ori_P, levels

In [78]:
def distance_matrix(G, attr_dict):
    def dist(vec1, vec2):
        return spatial.distance.cosine(vec1, vec2)
        # return np.linalg.norm(np.array(vec1) - np.array(vec2))**2
    embeddings = np.array([attr_dict[v.index] for v in G.vs])
    return spatial.distance.cdist(embeddings, embeddings, metric='cosine')
    # create a list of list of distances
    # D = defaultdict(lambda: defaultdict(float))
    # for index1, v1 in enumerate(G.vs):
    #     for index2, v2 in enumerate(G.vs):
    #         v1_index = v1.index
    #         v2_index = v2.index
    #         embedding1 = attr_dict[v1_index]
    #         embedding2 = attr_dict[v2_index]
    #         distance = dist(embedding1, embedding2)
    #         D[v1_index][v2_index] = distance
    #         D[v2_index][v1_index] = distance
    return D

In [79]:
D = distance_matrix(G_ccs, attr_dict)

In [17]:
S = 1 - D

In [18]:
for i, j in [0, 1]:
   J_i_j = weighted_common_neighbors(G, A, i, j)
   CS_i_j = J_i_j / (min(K[i], K[j]) + 1 - A[i][j])

0.7180868052652538 0.2819131947347462


In [None]:
# CG, levels = ilouvain(G_ccs, attr_dict, D)
# P, levels = ilouvain(G_ccs, attr_dict, D)
new_G, new_attr_dict, new_D = ilouvain(G_ccs, attr_dict, D)

In [97]:
def ravasz(G, attr_dict, D=None):
    def generate_node_pair(arr):
        node_pairs = []
        for v1 in arr:
            for v2 in arr:
                node_pairs.append((v1.index, v2.index))
        return node_pairs

    def weighted_degree(A):
        return {v: sum(A[v]) for v in range(0, len(A))}
    
    def weighted_common_neighbors(i, j,  G, A):
        i_neighbors = G.neighbors(i)
        j_neigobors = G.neighbors(j)
        common_neighbors = list(set(i_neighbors).intersection(set(j_neigobors)))
        return sum([A[i][v] for v in common_neighbors]) + sum([A[j][v] for v in common_neighbors])


    def weighted_TO(i, j, G, A, K, J):
        # J = weighted_common_neighbors(G, A, i, j)
        return J[i][j]/ (min(K[i], K[j]) + 1 - A[i][j])
    # def weighted_TO(i, j, x, y, G, A):
    #     J = weighted_common_neighbors(G, A, i, j)
    #     return J/ (min(x, y) + 1 - A[i][j])
    
    # def distance_matrix(G, attr_dict):
    #     def dist(vec1, vec2):
    #         return np.linalg.norm(np.array(vec1) - np.array(vec2))**2
    #     # create a list of list of distances
    #     D = defaultdict(lambda: defaultdict(float))
    #     for index1, v1 in enumerate(G.vs):
    #         for index2, v2 in enumerate(G.vs):
    #             v1_index = v1.index
    #             v2_index = v2.index
    #             embedding1 = attr_dict[v1_index]
    #             embedding2 = attr_dict[v2_index]
    #             distance = dist(embedding1, embedding2)
    #             D[v1_index][v2_index] = distance
    #             D[v2_index][v1_index] = distance
    #     return D

    def map_max(twod_list, max_value):
        return [[min(max_value, x) for x in row] for row in twod_list]

    def twod_sum(twod_list):
        return sum([sum(row) for row in twod_list])

    def partition(G):
        P = {}
        for index, v in enumerate(G.vs):
            P[v.index] = index
        return P 

    def similarity(G, A, K, D):
        # S = defaultdict(lambda: defaultdict(float))
        # list_node_pairs = generate_node_pair(G.vs)
        # for i, j in list_node_pairs:
        #     if i == j: 
        #         S[i][j] = -math.inf
        #         continue
        #     connectivity_similarity = weighted_TO(G, A, K, i, j)
        #     semantic_similarity = 1 - D[i][j]
        # S[i][j] = (connectivity_similarity + semantic_similarity) /2
        SS = 1 - D
        n = G.vcount()
        for i in range(n):
            SS[i][i] = -math.inf
        print("generating combinations")
        # combinations_indices = combinations(range(n), 2)
        row_indices = np.arange(n)
        print("generating J")
        print("multiplying A and A.T")
        sparse_A = csr_matrix(np.array(A))
        sparse_common_neighbors_matrix = sparse_A.dot(sparse_A.T)
        dense_common_neighbors_matrix = sparse_common_neighbors_matrix.A
        # Compute the weighted common neighbors matrix
        print("summing outer weights")
        J = np.add.outer(dense_common_neighbors_matrix, dense_common_neighbors_matrix)
        # J = np.vectorize(weighted_common_neighbors, excluded=['G', 'A'])(
        #     row_indices[:, np.newaxis],
        #     row_indices,
        #     G=G,
        #     A=A
        # )
        print("generating CS")
        CS = np.vectorize(weighted_TO, excluded=['G', 'A', 'K', 'J'])(
            row_indices[:, np.newaxis], 
            row_indices,
            G=G,
            A=A,
            K=K,
            J=J
        )

        # return SS
        return (SS + CS) / 2

    def reverse_index(P):
        comms = defaultdict(list)
        for v, comm in P.items():
            comms[comm].append(v)
        renumber_dict = {}
        for index, comm in enumerate(list(comms.keys())):
            renumber_dict[comm] = index
        renumbered_comms_dict = {
            renumber_dict[comm]: vertices for comm, vertices in comms.items()
        }
        return renumbered_comms_dict

    def calculate_weights(comm1, comm2, A):
        total = 0
        for v1 in comm1:
            for v2 in comm2:
                total += A[v1][v2]
        return total

    def fusion_matrix_adjacency(A, comms):
        print("fusion matrix adjacency comms: ", len(comms))

        new_weights = defaultdict(lambda: defaultdict(dict))
        for comm1, vertices1 in comms.items():
            for comm2, vertices2 in comms.items():
                new_weights[comm1][comm2]['weight'] = calculate_weights(vertices1, vertices2, A)
                # weights[comm2][comm1]['weight'] = weights[comm1][comm2]['weight']
        clustered_G = ig.Graph.DictDict(new_weights)
        clustered_A = map_max(clustered_G.get_adjacency(attribute='weight'), 1)
        return clustered_G, clustered_A
    
    def recalculate_attr(attr_dict, comms):
        new_attr_dict = {}
        for comm, vertices in comms.items():
            avg_attr = np.mean(np.array([attr_dict[v] for v in vertices]), axis=0)
            new_attr_dict[comm] = avg_attr
        return new_attr_dict


    levels = []
    P = partition(G)
    comms_dict = reverse_index(P)
    ori_graph_partition = P
    levels = defaultdict(list)
    level = 0
    # init levels
    for v in G.vs:
        levels[v.index].append(P[v.index])
    A = map_max(G.get_adjacency(attribute='weight'), 1)
    if D is None:
        D = distance_matrix(G, attr_dict)
    while(True):
        # init level slot
        for v, cur_levels in levels.items():
            cur_levels.append(None)
        print("clustering begin")
        print("initial nodes:", G.vcount())
        print("calculating weighted_degree")
        K = weighted_degree(A)
        print("calculating similarity matrix")
        similarity_matrix = similarity(G, A, K, D)
        print("calculating reverse index of G")
        ori_graph_comms_dict = reverse_index(ori_graph_partition)
        for v in G.vs:
            print("finding most similar node")
            # most_similar_node = max(similarity_matrix[v.index].items(), key=operator.itemgetter(1))[0]
            # most_similar_node = max(similarity_matrix[v.index], key=operator.itemgetter(1))
            most_similar_node = max(range(len(similarity_matrix[v.index])), key=similarity_matrix[v.index].__getitem__)

            print("moving node: ", v.index, " from comm: ", P[v.index], " to comm: ", P[most_similar_node])
            # merge v into most_similar_node in G
            for node in ori_graph_comms_dict[P[v.index]]:
                ori_graph_partition[node] = P[most_similar_node]
                levels[node][level] = P[most_similar_node]
            for node in ori_graph_comms_dict[P[most_similar_node]]:
                ori_graph_partition[node] = P[most_similar_node]
                levels[node][level] = P[most_similar_node]
            # rewrite at G'
            P[v.index] = P[most_similar_node]

        level += 1
        print("one iteration done")
        comms_dict = reverse_index(P)
        print("total nodes in comms:", sum([len(x) for x in ori_graph_comms_dict.values()]))
        c_G, c_A  = fusion_matrix_adjacency(A, comms_dict)
        print("clusters: ", c_G.vcount())
        # TODO: figure out what variables are needed for viz at each level
        # preserve the hierarchy
        attr_dict = recalculate_attr(attr_dict, comms_dict)
        # construct new distances between clusters
        D = distance_matrix(c_G, attr_dict)
        P = partition(c_G)
        # assign the result to operate recursively
        if G.vcount() < 10 or G.vcount() == c_G.vcount(): break
        print("pass done. ")
        G = c_G
        A = c_A
    return levels


In [98]:
# levels = ravasz(G_ccs, attr_dict, D)
levels = ravasz(G_ccs, attr_dict, D)

clustering begin
initial nodes: 7372
calculating weighted_degree
calculating similarity matrix
generating combinations
generating J
multiplying A and A.T
summing outer weights


MemoryError: Unable to allocate 21.0 PiB for an array with shape (7372, 7372, 7372, 7372) and data type float64

In [42]:
def _renumber_dict(P):
    comm_set = set(P.values())
    renumber_dict = {comm: index for index, comm in enumerate(comm_set)}
    return renumber_dict
    # P = {v: renumber_dict[comm] for v, comm in P.items()}
    # return P
    

def levels_to_partitions(G, levels):
    partitions = []
    for v in G.vs:
        levels[v.index] = levels[v.index][0:-1]
    for level in range(len(levels[0])):
        P = {}
        for v in G.vs:
            P[v['name']] = levels[v.index][level]
        renumber_dict = _renumber_dict(P)
        P = {v: renumber_dict[comm] for v, comm in P.items()}
        for v in G.vs:
            levels[v.index][level] = P[v['name']]
        partitions.append(P)
    last_partition = partitions[-1]
    comm_labels = set(last_partition.values())
    if len(comm_labels) > 1:
        partitions.append({v['name']: 0 for v in G.vs})
        for v in G.vs:
            levels[v.index].append(0)
    return partitions, levels
partitions, renumbered_levels = levels_to_partitions(G_ccs, copy.deepcopy(levels))

In [7]:
def add_dummy_partition(partitions):
    first_partition = partitions[0]
    dummy_partition = {}
    for index, node_id in enumerate(list(first_partition.keys())):
        dummy_partition[node_id] = index
    partitions.insert(0, dummy_partition)
    return partitions
partitions = json.load(open('data/result/AllTheNews/network/ravasz_partitions_old.json', 'r'))
partitions = add_dummy_partition(partitions)
save_json(partitions, 'data/result/AllTheNews/network/ravasz_partitions.json')

In [17]:
def dfs(hierarchy, leaf_children_dict):
    cur_level_label = hierarchy['title'].split("-")[1]
    cur_cluster_label = hierarchy['title'].split("-")[2]
    new_level_label = str(int(cur_level_label) + 1)
    hierarchy['title'] = "L-{}-{}".format(new_level_label, cur_cluster_label)
    hierarchy['key'] = "L-{}-{}".format(new_level_label, cur_cluster_label)
    if 'children' in hierarchy:
        for child in hierarchy['children']:
            dfs(child, leaf_children_dict)
    else:
        dummy_clusters = leaf_children_dict[cur_cluster_label]
        print(dummy_clusters, cur_cluster_label)
        hierarchy['children'] = []
        for dummy_cluster_label in dummy_clusters:
            hierarchy['children'].append({ 
                "title": "L-0-{}".format(dummy_cluster_label),
                "key": "L-0-{}".format(dummy_cluster_label),
            })
    return

def add_dummy_hierarchy(partitions, hierarchies):
    first_partition = partitions[0]
    second_partition = partitions[1]
    second_level_children_dict = defaultdict(list)
    for node_id, dummy_cluster_label in first_partition.items():
        parent_cluster_label = second_partition[node_id]
        second_level_children_dict[str(parent_cluster_label)].append(dummy_cluster_label)
    print(second_level_children_dict)
    dfs(hierarchies, second_level_children_dict)
    return hierarchies
hierarchies = json.load(open("data/result/AllThenews/network/ravasz_hierarchies_old.json"))
new_hierarchies = add_dummy_hierarchy(partitions, hierarchies)
save_json(new_hierarchies, "data/result/AllTheNews/network/ravasz_hierarchies.json")

defaultdict(<class 'list'>, {'2': [0, 22, 23, 24, 28, 2988, 3598, 4895], '0': [1], '8': [2, 108], '1304': [3, 4696], '115': [4, 747, 4814, 5172, 5618, 6173, 6533, 7214], '542': [5, 195, 903, 1717, 2270, 2305, 2306, 2365, 2823, 4143], '1540': [6, 2987, 5475, 5586, 5667, 6843], '617': [7, 2527, 4808], '1098': [8], '441': [9, 71, 1806, 5271], '513': [10, 2135, 2159, 4763, 5597, 6096, 6348], '235': [11], '14': [12], '2167': [13, 5518, 5923], '1': [14], '1861': [15, 1329, 1635, 3603, 6271, 6329, 6356, 6374], '64': [16, 497, 504, 506, 508, 509, 513, 1863, 1872, 1874, 2351, 3589], '604': [17, 2441, 2454, 2480, 2481, 3440, 4681, 6864], '1216': [18, 3551, 3953, 4096, 4461, 5544], '892': [19], '445': [20], '1505': [21, 3108, 3201, 3889, 4028, 5376, 5387, 7244], '485': [25, 2040, 2041], '1114': [26, 1557, 3475, 4152, 4453, 5276, 5316, 5903, 6382], '884': [27, 816, 1082, 3459, 4369, 6926, 6960, 7488, 7503], '2018': [29, 6697], '111': [30, 410, 721, 1349, 2280, 3221, 3328], '839': [31, 3292, 6736],

In [9]:
def get_level_transition(levels):
    nested_comms = {}
    for i in range(len(levels[0])-1):
        for v, transitions in levels.items():
            trans_children_title = "L-{}-{}".format(i, transitions[i])
            trans_parent_title = "L-{}-{}".format(i+1, transitions[i+1])
            # if children is the first level
            if trans_children_title not in nested_comms:
                # create leaf
                nested_comms[trans_children_title] = {
                    "title": trans_children_title,
                    "key": trans_children_title
                }
                # add to parent 
                if trans_parent_title not in nested_comms:
                    nested_comms[trans_parent_title] = {
                        "title": trans_parent_title,
                        "key": trans_parent_title,
                        'children': [nested_comms[trans_children_title]]
                    }
                # avoid adding duplicate children
                elif trans_children_title not in [child['title'] for child in nested_comms[trans_parent_title]['children']]:
                    nested_comms[trans_parent_title]['children'].append(nested_comms[trans_children_title])
            else:
                # if children is not the first level
                # add to parent directly
                if trans_parent_title not in nested_comms:
                    nested_comms[trans_parent_title] = {
                        "title": trans_parent_title,
                        "key": trans_parent_title,
                        'children': [nested_comms[trans_children_title]]
                    }
                # avoid adding duplicate children
                elif trans_children_title not in [child['title'] for child in nested_comms[trans_parent_title]['children']]:
                    nested_comms[trans_parent_title]['children'].append(nested_comms[trans_children_title])
    final_level = len(levels[0])-1
    return nested_comms['L-{}-{}'.format(final_level, 0)]
print(renumbered_levels[500])
hierarchies = get_level_transition(renumbered_levels)

NameError: name 'renumbered_levels' is not defined

In [5]:
def save_json(data, filepath=r'new_data.json'):
   with open(filepath, 'w') as fp:
      json.dump(data, fp, indent=4)

In [44]:
print(partitions[1])
save_json(partitions, "data/result/AllTheNews/network/ravasz_partitions.json")
# save_json(partitions, "data/result/RAMS/gpt_biHgraph_dev/ravasz_partitions.json")



In [45]:
hierarchies
save_json(hierarchies, "data/result/AllThenews/network/ravasz_hierarchies.json")
# save_json(hierarchies, "data/result/RAMS/gpt_biHgraph_dev/ravasz_hierarchies.json")