In [1]:
import networkx as nx
import hypernetx as hnx
import numpy as np
from scipy import spatial
from scipy.sparse import csr_matrix

import json
import hypernetx.algorithms.hypergraph_modularity as hmod
import igraph as ig
from collections import defaultdict
from itertools import combinations

import itertools
import copy
import time
import sys
import operator
import math

 No module named 'celluloid'. If you need to use hypernetx.algorithms.contagion, please install additional packages by running the following command: pip install .['all']


In [2]:
# Load the frontend graph export and the article table.
# Context managers close the file handles deterministically instead of
# leaving them to the garbage collector.
with open('data/result/AllTheNews/network/server/frontend_2.json') as frontend_file:
    frontend_data = json.load(frontend_file)
# Keep only the nodes tagged as articles.
article_nodes = [node for node in frontend_data['nodes'] if node['type'] == 'article']
with open('data/result/AllTheNews/network/articles.json') as articles_file:
    article_data = json.load(articles_file)
# Sanity check: both sources should describe the same number of articles.
print(len(article_nodes), len(article_data))

7542 7542


In [3]:
# Frontend node records (dicts) whose 'type' is 'entity'.
# NOTE(review): the name `entity_nodes` is rebound further down to a list of
# plain node ids taken from B, so its meaning depends on which cell ran last.
entity_nodes = [node for node in frontend_data['nodes'] if node['type'] == 'entity']

In [4]:
# Report how many entity records there are, index them by id, and
# spot-check one known entry.
print(len(entity_nodes))
entity_node_dict = dict((record['id'], record) for record in entity_nodes)
print(entity_node_dict['Shelby Pfeffer'])

7397
{'id': 'Shelby Pfeffer', 'title': 'Shelby Pfeffer', 'entity_type': 'PERSON', 'type': 'entity', 'mentions': [{'doc_id': '108711', 'mention': 'Shelby Pfeffer'}]}


In [5]:
# read network
# AllTheNews
# Each file is opened in a `with` block so the handle is closed right after
# parsing instead of being leaked.
with open('data/result/AllTheNews/network/hgraph.json') as hgraph_file:
    B = nx.node_link_graph(json.load(hgraph_file))
print(B.number_of_nodes(), B.number_of_edges())
with open('data/result/AllTheNews/network/entities.json') as entities_file:
    entity_data = json.load(entities_file)
with open('data/result/AllTheNews/network/articles.json') as articles_file:
    article_data = json.load(articles_file)
# Nodes of B that are not articles, edge count, article count.
print(B.number_of_nodes() - len(article_data), B.number_of_edges(), len(article_data))

14939 14027
7397 14027 7542


In [6]:
# Build the hypergraph from the bipartite graph B and display its shape.
# NOTE(review): the recorded shape (7397, 7542) matches the entity and article
# counts printed above, so nodes presumably are entities and hyperedges
# articles -- confirm against the 'bipartite' attribute convention of B.
H = hnx.Hypergraph.from_bipartite(B)
H.shape

(7397, 7542)

## Reduce the hypergraph to a two-section graph using the edge reweighting proposed in [1]


[1] Kumar T., Vaidyanathan S., Ananthapadmanabhan H., Parthasarathy S. and Ravindran B. “A New Measure of Modularity in Hypergraphs: Theoretical Insights and Implications for Effective Clustering”. In: Cherifi H., Gaito S., Mendes J., Moro E., Rocha L. (eds) Complex Networks and Their Applications VIII. COMPLEX NETWORKS 2019. Studies in Computational Intelligence, vol 881. Springer, Cham

In [7]:
# hyperedge_dict = json.load(open('data/result/RAMS/gpt_biHgraph_dev/hyperedges_w_embeddings.json'))
# Load the embedding records and index them by their 'id' field.
# The `with` block closes the file handle once parsing is done.
with open('data/raw/AllTheNews/embeddings/2016_10p.json') as embeddings_file:
    embeddings = json.load(embeddings_file)
embeddings_dict = {embedding['id']: embedding for embedding in embeddings}

In [8]:
# Map every article id in article_data to its embedding vector.
# A KeyError here means an article has no record in embeddings_dict.
article_embeddings = {
    doc_id: embeddings_dict[doc_id]['embedding']
    for doc_id in article_data
}

In [14]:
# Display the node ids of H.
# NOTE(review): this materialises and prints every node id; a slice such as
# list(H.nodes)[:10] would keep the saved notebook output readable.
list(H.nodes)

['',
 'Q868850',
 'Q317521',
 'Q1251435',
 'Jess Katz',
 'Americans for a Better Way',
 'Kathleen Purvis',
 'Q1137030',
 'Jonathan Dirlam',
 'Q49762',
 'Iraqi special forces',
 'Q6730149',
 'Q16911905',
 'Zika infection',
 'Q60604849',
 'Cryptomaster leviathan',
 'federal and state officials',
 'Ronald and Nancy Reagan',
 'Philip "Mitch" Brailsford',
 'Q1909845',
 'Q6241824',
 'Dillon Arnold',
 'Q26876',
 'Q7614572',
 'Mario Anthony Hernandez',
 'Q1515367',
 ' and lions',
 'Q1158',
 'Q6409856',
 'Q713479',
 'Q3332289',
 'Joey Bartolomeo',
 'Q21259744',
 'Marion Hedges',
 'Twitter user',
 'Q4917',
 'Q43144',
 'Q23559',
 'Ryan Manchester',
 'Q14918329',
 'Q16751186',
 'Cuban men and women',
 'Slivka',
 'Automation and robotics',
 'Q264589',
 'Russian warplanes',
 'Q35535',
 'Abdulrahman Alharbi',
 'pickup driver',
 'Q24893847',
 'average people',
 'women',
 'Gen John Kelly',
 'Erin',
 'Q6107',
 'Q6396633',
 'Q62398078',
 'Oshun Afrique',
 'Atima Omara',
 'Q58132',
 'Q18335',
 'Q3395383',

In [None]:
# clustering on hyperedges
# Dual of H; only the commented-out "articles" variants in later cells
# refer to it, the active entity pipeline works on H itself.
dual_H = H.dual()
# print(dual_H.shape)

In [9]:
# Ids of the nodes in B whose 'bipartite' attribute equals 1, followed by a
# quick look at the first ten of them.
entity_nodes = [
    node_id
    for node_id, node_attrs in B.nodes(data=True)
    if node_attrs["bipartite"] == 1
]
print(entity_nodes[:10])

['', 'Yuri Shipulin', 'Q3298441', 'Q312792', 'Xian Gulin', 'Q180098', 'Q22073688', 'No. 2 oil services company', 'Mississippi Church Protection Act', 'Q1394500']


In [10]:
# Ids of the nodes in B whose 'bipartite' attribute equals 1.
entity_nodes = [
    node_id
    for node_id, node_attrs in B.nodes(data=True)
    if node_attrs["bipartite"] == 1
]
# An entity's embedding is the element-wise mean of the embeddings of all of
# its neighbours in B.
entity_embeddings = {
    entity_id: np.mean(
        np.array([embeddings_dict[neighbour_id]['embedding'] for neighbour_id in B.neighbors(entity_id)]),
        axis=0,
    )
    for entity_id in entity_nodes
}

In [11]:
# Re-display the shape of H as a sanity check before clustering.
H.shape

(7397, 7542)

In [12]:
# dual_H is for clustering articles, H is for clustering entities
# component_subgraphs = dual_H.s_component_subgraphs(edges=False, return_singletons=True)
component_subgraphs = H.s_component_subgraphs(edges=False, return_singletons=True)

G_ccs = ig.Graph()
weights = defaultdict(lambda: defaultdict(dict))
total = 0
event_set = set()
total_edges = 0
print("finding connected components...")
for s_component in component_subgraphs:
    print("total: ", total)
    if total > 2500: break
    total += s_component.shape[0]
    if s_component.shape[0] == 1:
        event = list(s_component.nodes())[0]
        event_name = "-".join(v['name'].split('-')[1:])
        if event_name not in event_set:
            G_ccs.add_vertices(list(s_component.nodes()))
        continue
    print("component_size: ", s_component.shape[0])
    print("reweighting...")
    cc = hmod.two_section(s_component)
    print("two_section graph size:", cc.vcount())
    index2id_dict = {}
    for v in cc.vs:
        index2id_dict[v.index] = v['name']

    deleted_vertices = []
    for v in cc.vs:
        event_name = "-".join(v['name'].split('-')[1:])
        if event_name in event_set:
            deleted_vertices.append(v['name'])
        event_set.add(event_name)

    # cc.delete_vertices(deleted_vertices)

    deleted_edges = []
    for e in cc.es:
        if index2id_dict[e.source] in deleted_vertices or index2id_dict[e.target] in deleted_vertices:
            deleted_edges.append((e.source, e.target))
    cc.delete_edges(deleted_edges)

    # edges = [(e.source, e.target, e['weight']) for e in cc.es]
    # G_ccs.add_vertices([v['name'] for v in cc.vs])
    total_edges += len(cc.es)

    # G_ccs.add_edges([(index2id_dict[e.source], index2id_dict[e.target]) for e in cc.es])
    for v in cc.vs:
        weights[v['name']][v['name']]['weight'] = 0
    if len(cc.es) != 0:
        for e in cc.es:
            weights[cc.vs[e.source]['name']][cc.vs[e.target]['name']]['weight'] = e['weight']
            weights[cc.vs[e.target]['name']][cc.vs[e.source]['name']]['weight'] = e['weight']
    # if total >= 4000:
    #     break

# print([G_cc.vcount() for G_cc in G_ccs])
# print(G_ccs.vcount())
print(total_edges, len(weights))
# GU = ig.union(G_ccs)

finding connected components...
getting s_components
getting linegraph
getting connected components
total:  0
component_size:  2527
reweighting...
two_section graph size: 2527
total:  2527
0 2527


In [13]:
# Build the working graph from the nested weight mapping.
G_ccs = ig.Graph.DictDict(weights)
# articles
# all_article_nodes = [node for node in dual_H.nodes()]
# largest_cc_nodes = [v['name'] for v in G_ccs.vs]
# for article_id in all_article_nodes:
#     if article_id not in largest_cc_nodes:
#         G_ccs.add_vertex(article_id)

# entities
all_entity_nodes = [node for node in H.nodes()]
largest_cc_nodes = [v['name'] for v in G_ccs.vs]
# Add every node of H that is not yet in the graph as an isolated vertex.
# A set lookup replaces the former per-entity list scan, and the vertices are
# added in one batch (same order as before) instead of one call per vertex.
present_names = set(largest_cc_nodes)
missing_entity_ids = [entity_id for entity_id in all_entity_nodes if entity_id not in present_names]
G_ccs.add_vertices(missing_entity_ids)
G_ccs.vcount()

7397

In [None]:
# Earlier variant (disabled): re-add degree-0 nodes of B to G_ccs so they take
# part in the clustering.
# singletons = [node for node in B.nodes() if B.degree(node) == 0]
# for singleton in singletons:
#     G_ccs.add_vertex(singleton)
# Number of articles next to the vertex and edge counts of the working graph.
print(len(article_data), G_ccs.vcount(), G_ccs.ecount())

In [14]:
# Reverse lookup: vertex name -> vertex index in G_ccs.
id2index_dict = {vertex['name']: vertex.index for vertex in G_ccs.vs}

In [15]:
# Attribute vectors keyed by vertex index.
# articles variant (disabled):
# attr_dict = {v.index: article_embeddings[v['name']] for v in G_ccs.vs}
# entities variant (active):
attr_dict = dict(
    (vertex.index, entity_embeddings[vertex['name']])
    for vertex in G_ccs.vs
)

In [16]:
def distance_matrix(G, attr_dict):
    """Return the pairwise cosine-distance matrix of the vertices of ``G``.

    Parameters
    ----------
    G : object whose ``vs`` attribute iterates over vertices exposing ``.index``.
    attr_dict : mapping from vertex index to an attribute vector; all vectors
        must have the same length.

    Returns
    -------
    numpy.ndarray
        Square array whose entry ``[i][j]`` is the cosine distance between the
        vectors of the i-th and j-th vertex, in ``G.vs`` iteration order.
        A graph without vertices yields an empty ``(0, 0)`` array.
    """
    embeddings = np.array([attr_dict[v.index] for v in G.vs])
    if embeddings.size == 0:
        # cdist rejects a 1-D empty input, so answer the empty case directly.
        return np.empty((0, 0))
    # The old body carried an unused per-pair helper and an unreachable
    # `return D` on an undefined name after this statement; both are gone.
    return spatial.distance.cdist(embeddings, embeddings, metric='cosine')

In [17]:
# Precompute the pairwise distance matrix once so it can be handed to ravasz()
# instead of being recomputed there.
D = distance_matrix(G_ccs, attr_dict)

In [18]:
def ravasz(G, attr_dict, D=None):
    """Cluster the vertices of ``G`` bottom-up, one hierarchy level per pass.

    In every pass each vertex of the current graph is moved into the community
    of the vertex it is most similar to (similarity is ``1 - D``). The
    resulting communities are then collapsed into a smaller graph whose
    attribute vectors are the means of their members, and the next pass runs
    on that graph.

    Parameters
    ----------
    G : graph to cluster. The code uses ``G.vs``, ``G.vcount()`` and
        ``G.get_adjacency(attribute='weight')``; the collapsed graphs are
        built with ``ig.Graph.DictDict``, so this is presumably an igraph
        graph -- confirm at the call site.
    attr_dict : mapping from vertex index to an attribute vector.
    D : optional pairwise distance matrix for ``G``. When ``None`` it is
        computed with ``distance_matrix(G, attr_dict)``. ``similarity``
        evaluates ``1 - D`` and assigns to ``SS[i][i]``, so it is assumed to
        be array-like (e.g. a numpy array).

    Returns
    -------
    levels : ``defaultdict(list)`` keyed by the vertex indices of the input
        graph. Entry ``k`` of each list is the community label written in
        pass ``k``; the last entry is a ``None`` placeholder that was
        appended at the start of the final pass and never filled.
    """
    def weighted_degree(A):
        """Return ``{row index: sum of that row of A}``."""
        return {v: sum(A[v]) for v in range(0, len(A))}

    def weighted_common_neighbors(G, A, i, j):
        """Sum ``A[i][v] + A[j][v]`` over the common neighbours ``v`` of i and j."""
        i_neighbors = G.neighbors(i)
        j_neigobors = G.neighbors(j)
        common_neighbors = list(set(i_neighbors).intersection(set(j_neigobors)))
        return sum([A[i][v] for v in common_neighbors]) + sum([A[j][v] for v in common_neighbors])


    # def weighted_TO(i, j, G, A, K, J):
    #     # J = weighted_common_neighbors(G, A, i, j)
    #     return J[i][j]/ (min(K[i], K[j]) + 1 - A[i][j])
    def weighted_TO(i, j, G, A, K):
        """Return ``J / (min(K[i], K[j]) + 1 - A[i][j])`` with J the weighted
        common-neighbour sum of i and j.

        NOTE(review): only referenced from commented-out code in
        ``similarity``; it also prints every (i, j) pair it is called with.
        """
        print(i, j)
        J = weighted_common_neighbors(G, A, i, j)
        return J/ (min(K[i], K[j]) + 1 - A[i][j])

    def map_max(twod_list, max_value):
        """Cap every entry of a 2-D list at ``max_value`` (element-wise min)."""
        return [[min(max_value, x) for x in row] for row in twod_list]

    def twod_sum(twod_list):
        """Return the sum of all entries of a 2-D list (not called in this function)."""
        return sum([sum(row) for row in twod_list])

    def partition(G):
        """Return ``{vertex index: position in G.vs}``, i.e. one community per vertex."""
        P = {}
        for index, v in enumerate(G.vs):
            P[v.index] = index
        return P

    def similarity(G, A, K, D):
        """Return the similarity matrix ``1 - D`` with ``-inf`` on the diagonal.

        The ``-inf`` diagonal keeps a vertex from being selected as its own
        most similar vertex. ``A`` and ``K`` are accepted but not used by the
        active code; the structural term built from them is disabled (see the
        commented-out code below).
        """
        SS = 1 - D
        n = G.vcount()
        for i in range(n):
            SS[i][i] = -math.inf
        print("generating combinations")
        # combinations_indices = combinations(range(n), 2)
        # NOTE(review): row_indices is only consumed by the disabled code below.
        row_indices = np.arange(n)
        # Disabled experiment: weighted common-neighbour matrix J via a sparse
        # product of A with its transpose.
        # sparse_A = csr_matrix(np.array(A))
        # sparse_common_neighbors_matrix = sparse_A.dot(sparse_A.T)
        # dense_common_neighbors_matrix = sparse_common_neighbors_matrix.A
        # J = np.add.outer(dense_common_neighbors_matrix, dense_common_neighbors_matrix)
        print("generating CS")
        # Disabled experiment: structural similarity CS from weighted_TO.
        # CS = np.vectorize(weighted_TO, excluded=['G', 'A', 'K'])(
        #     row_indices[:, np.newaxis],
        #     row_indices,
        #     G=G,
        #     A=A,
        #     K=K
        # )


        return SS
        # return (SS + CS) / 2

    def reverse_index(P):
        """Invert a vertex->community mapping into community->vertices.

        Communities are renumbered 0..k-1 in the order in which their labels
        are first met while iterating ``P``.
        """
        comms = defaultdict(list)
        for v, comm in P.items():
            comms[comm].append(v)
        renumber_dict = {}
        for index, comm in enumerate(list(comms.keys())):
            renumber_dict[comm] = index
        renumbered_comms_dict = {
            renumber_dict[comm]: vertices for comm, vertices in comms.items()
        }
        return renumbered_comms_dict

    def calculate_weights(comm1, comm2, A):
        """Sum ``A[v1][v2]`` over all pairs with v1 in comm1 and v2 in comm2."""
        total = 0
        for v1 in comm1:
            for v2 in comm2:
                total += A[v1][v2]
        return total

    def fusion_matrix_adjacency(A, comms):
        """Collapse communities into single vertices.

        Returns the collapsed graph, whose weight for every ordered pair of
        communities (including a community with itself) is the summed weight
        between their members, and its weight adjacency capped at 1.
        """
        print("fusion matrix adjacency comms: ", len(comms))

        new_weights = defaultdict(lambda: defaultdict(dict))
        for comm1, vertices1 in comms.items():
            for comm2, vertices2 in comms.items():
                new_weights[comm1][comm2]['weight'] = calculate_weights(vertices1, vertices2, A)
                # weights[comm2][comm1]['weight'] = weights[comm1][comm2]['weight']
        clustered_G = ig.Graph.DictDict(new_weights)
        clustered_A = map_max(clustered_G.get_adjacency(attribute='weight'), 1)
        return clustered_G, clustered_A

    def recalculate_attr(attr_dict, comms):
        """Return ``{community: element-wise mean of its members' attribute vectors}``."""
        new_attr_dict = {}
        for comm, vertices in comms.items():
            avg_attr = np.mean(np.array([attr_dict[v] for v in vertices]), axis=0)
            new_attr_dict[comm] = avg_attr
        return new_attr_dict


    # NOTE(review): this list is replaced by a defaultdict a few lines below.
    levels = []
    P = partition(G)
    comms_dict = reverse_index(P)
    # Same dict object as P (no copy): during the first pass, updates to P are
    # therefore also visible through ori_graph_partition.
    ori_graph_partition = P
    levels = defaultdict(list)
    level = 0
    # init levels
    for v in G.vs:
        levels[v.index].append(P[v.index])
    # Weight adjacency of the input graph, capped at 1.
    A = map_max(G.get_adjacency(attribute='weight'), 1)
    if D is None:
        D = distance_matrix(G, attr_dict)
    while(True):
        # init level slot
        # Each pass appends one placeholder; the label assignment further down
        # writes to index `level`, so in the first pass it overwrites the
        # initial entry and the appended None stays at the end of the list.
        for v, cur_levels in levels.items():
            cur_levels.append(None)
        print("clustering begin")
        print("initial nodes:", G.vcount())
        print("calculating weighted_degree")
        K = weighted_degree(A)
        print("calculating similarity matrix")
        similarity_matrix = similarity(G, A, K, D)
        print("calculating reverse index of G")
        # Community (a vertex of the current graph) -> original vertex indices.
        ori_graph_comms_dict = reverse_index(ori_graph_partition)
        most_similar_nodes = set()
        for v in G.vs:
            print("finding most similar node")
            # Index of the first maximal entry in v's similarity row.
            most_similar_node = max(range(len(similarity_matrix[v.index])), key=similarity_matrix[v.index].__getitem__)

            print("moving node: ", v.index, " from comm: ", P[v.index], " to comm: ", P[most_similar_node])
            most_similar_nodes.add(P[most_similar_node])
            print(len(ori_graph_comms_dict), len(ori_graph_partition))
            # merge v into most_similar_node in G
            # rewrite at G'
            # Moves are applied immediately, so vertices handled later see the
            # labels already changed earlier in this loop.
            P[v.index] = P[most_similar_node]
        # Propagate the new labels down to the vertices of the original graph
        # and record them as this pass's level.
        for v, c in P.items():
            for node in ori_graph_comms_dict[v]:
                ori_graph_partition[node] = c
                levels[node][level] = c
        ori_graph_comms_dict = reverse_index(ori_graph_partition)
        print("most similar nodes: ", len(most_similar_nodes), len(ori_graph_comms_dict))

        level += 1
        print("one iteration done")
        comms_dict = reverse_index(P)
        print("total nodes in comms:", sum([len(x) for x in ori_graph_comms_dict.values()]))
        c_G, c_A  = fusion_matrix_adjacency(A, comms_dict)
        print("clusters: ", c_G.vcount())
        # preserve the hierarchy
        attr_dict = recalculate_attr(attr_dict, comms_dict)
        # construct new distances between clusters
        D = distance_matrix(c_G, attr_dict)
        P = partition(c_G)
        # assign the result to operate recursively
        # Stop when the graph clustered in this pass had fewer than 10 vertices
        # or when collapsing did not reduce the vertex count.
        if G.vcount() < 10 or G.vcount() == c_G.vcount(): break
        print("pass done. ")
        G = c_G
        A = c_A
    return levels


In [19]:
# Run the hierarchical clustering on the entity graph, reusing the
# precomputed distance matrix D.
# NOTE(review): ravasz() prints several lines per vertex in every pass, which
# floods the cell output.
levels = ravasz(G_ccs, attr_dict, D)

clustering begin
initial nodes: 7397
calculating weighted_degree
calculating similarity matrix
generating combinations
generating CS
calculating reverse index of G
finding most similar node
moving node:  0  from comm:  0  to comm:  1
7397 7397
finding most similar node
moving node:  1  from comm:  1  to comm:  1
7397 7397
finding most similar node
moving node:  2  from comm:  2  to comm:  34
7397 7397
finding most similar node
moving node:  3  from comm:  3  to comm:  56
7397 7397
finding most similar node
moving node:  4  from comm:  4  to comm:  5
7397 7397
finding most similar node
moving node:  5  from comm:  5  to comm:  16
7397 7397
finding most similar node
moving node:  6  from comm:  6  to comm:  209
7397 7397
finding most similar node
moving node:  7  from comm:  7  to comm:  1131
7397 7397
finding most similar node
moving node:  8  from comm:  8  to comm:  905
7397 7397
finding most similar node
moving node:  9  from comm:  9  to comm:  1251
7397 7397
finding most similar no

In [20]:
def _renumber_dict(P):
    """Map each distinct community label in ``P`` to a compact id 0..k-1.

    The ids follow the iteration order of the set of labels.
    """
    distinct_labels = set(P.values())
    return dict(zip(distinct_labels, range(len(distinct_labels))))


def levels_to_partitions(G, levels):
    """Turn per-vertex level lists into one partition dict per level.

    ``levels`` maps a vertex index to its list of community labels and is
    modified in place: the last entry of every list is dropped, the remaining
    labels are renumbered compactly per level, and a final all-zero level is
    appended when the last level still has more than one community.

    Returns ``(partitions, levels)``, where ``partitions[k]`` maps vertex name
    to the renumbered label at level ``k``.
    """
    vertices = list(G.vs)

    # Drop the trailing entry of every vertex's level list.
    for vertex in vertices:
        levels[vertex.index] = levels[vertex.index][:-1]

    partitions = []
    for depth in range(len(levels[0])):
        raw_labels = {vertex['name']: levels[vertex.index][depth] for vertex in vertices}
        compact_ids = _renumber_dict(raw_labels)
        level_partition = {name: compact_ids[label] for name, label in raw_labels.items()}
        # Write the renumbered labels back so levels and partitions agree.
        for vertex in vertices:
            levels[vertex.index][depth] = level_partition[vertex['name']]
        partitions.append(level_partition)

    # Make sure the hierarchy ends in a single root community labelled 0.
    if len(set(partitions[-1].values())) > 1:
        partitions.append({vertex['name']: 0 for vertex in vertices})
        for vertex in vertices:
            levels[vertex.index].append(0)
    return partitions, levels
# levels_to_partitions modifies its `levels` argument in place, so it is given
# a deep copy to keep the raw output of ravasz() intact.
partitions, renumbered_levels = levels_to_partitions(G_ccs, copy.deepcopy(levels))

In [21]:
# Number of vertex entries in the second partition level.
print(len(partitions[1]))

7397


In [22]:
def save_json(data, filepath=r'new_data.json'):
    """Write ``data`` to ``filepath`` as JSON indented by four spaces.

    An existing file at ``filepath`` is overwritten.
    """
    with open(filepath, 'w') as out_file:
        json.dump(data, out_file, indent=4)


In [24]:
def add_dummy_partition(partitions):
    """Prepend a partition that puts every node in its own cluster.

    The node ids are taken from the keys of ``partitions[0]`` and numbered in
    iteration order. ``partitions`` is modified in place and also returned.
    """
    singleton_partition = {
        node_id: position
        for position, node_id in enumerate(partitions[0])
    }
    partitions.insert(0, singleton_partition)
    return partitions
# Load the saved partitions, make sure they start with a one-cluster-per-node
# level, and write them back to the same file.
with open('data/result/AllTheNews/network/server/ravasz_partitions_entity.json', 'r') as partitions_file:
    partitions = json.load(partitions_file)
print(len(partitions[0]), len(article_data))
# This cell reads and rewrites the same file, so re-running it used to stack
# another dummy level on top each time. Only add the level when the first
# partition is not already one cluster per node.
first_level_labels = list(partitions[0].values())
if len(set(first_level_labels)) == len(first_level_labels):
    print("dummy partition already present; file left unchanged")
else:
    partitions = add_dummy_partition(partitions)
    save_json(partitions, 'data/result/AllTheNews/network/server/ravasz_partitions_entity.json')

7397 7542


In [27]:
def dfs(hierarchy, leaf_children_dict):
    """Shift a hierarchy one level up and hang dummy leaves under its leaves.

    Titles have the form ``L-<level>-<cluster>``. Every node's level is
    increased by one in both ``title`` and ``key``. A node without a
    ``children`` entry receives one ``L-0-<label>`` child for every label in
    ``leaf_children_dict[<cluster>]``. The tree is modified in place.
    """
    title_parts = hierarchy['title'].split("-")
    cur_level_label = title_parts[1]
    cur_cluster_label = title_parts[2]
    shifted_title = "L-{}-{}".format(str(int(cur_level_label) + 1), cur_cluster_label)
    hierarchy['title'] = shifted_title
    hierarchy['key'] = shifted_title

    if 'children' not in hierarchy:
        # Former leaf: attach the dummy clusters registered for its label.
        dummy_clusters = leaf_children_dict[cur_cluster_label]
        print(dummy_clusters, cur_cluster_label)
        hierarchy['children'] = [
            {
                "title": "L-0-{}".format(dummy_cluster_label),
                "key": "L-0-{}".format(dummy_cluster_label),
            }
            for dummy_cluster_label in dummy_clusters
        ]
        return

    for child in hierarchy['children']:
        dfs(child, leaf_children_dict)

def add_dummy_hierarchy(partitions, hierarchies):
    """Insert the level described by ``partitions[0]`` below the hierarchy's leaves.

    For each node, its label in ``partitions[0]`` is filed under the string
    form of its label in ``partitions[1]``. ``dfs`` then shifts the tree up
    by one level and attaches those labels as new leaves. ``hierarchies`` is
    modified in place and returned.
    """
    dummy_partition, parent_partition = partitions[0], partitions[1]
    children_by_parent = defaultdict(list)
    for node_id, dummy_cluster_label in dummy_partition.items():
        parent_key = str(parent_partition[node_id])
        children_by_parent[parent_key].append(dummy_cluster_label)
    print(children_by_parent)
    dfs(hierarchies, children_by_parent)
    return hierarchies
# Load the saved hierarchy, add the dummy leaf level, and write it back.
# The read path used to spell the directory "AllThenews"; every other path in
# this notebook uses "AllTheNews", and the old spelling only resolves on
# case-insensitive file systems.
# NOTE(review): this reads and rewrites the same file, so running the cell a
# second time shifts the levels again.
with open("data/result/AllTheNews/network/server/ravasz_hierarchies_entity.json") as hierarchies_file:
    hierarchies = json.load(hierarchies_file)
new_hierarchies = add_dummy_hierarchy(partitions, hierarchies)
save_json(new_hierarchies, "data/result/AllTheNews/network/server/ravasz_hierarchies_entity.json")

defaultdict(<class 'list'>, {'0': [0, 1, 2217], '6': [2, 34, 401, 510, 640, 642, 653, 744, 745, 751, 904, 948, 1021, 1023, 1147, 1197, 1209, 1211, 1245, 1301, 1400, 1412, 1427, 1534, 1542, 1570, 1618, 1649, 1672, 1727, 1800, 2133, 2246, 2310, 2355, 2371, 2389, 2762, 4343, 4922, 5370, 5477], '13': [3, 19, 40, 56, 103, 104, 127, 173, 185, 189, 191, 205, 208, 222, 224, 235, 250, 273, 284, 321, 331, 357, 360, 381, 392, 403, 445, 474, 475, 502, 506, 532, 577, 585, 637, 668, 699, 702, 723, 728, 729, 747, 754, 766, 769, 783, 820, 825, 862, 877, 879, 907, 919, 920, 945, 957, 959, 983, 1047, 1128, 1255, 1256, 1258, 1288, 1292, 1298, 1322, 1328, 1339, 1350, 1375, 1387, 1389, 1392, 1394, 1405, 1417, 1449, 1470, 1490, 1494, 1495, 1501, 1508, 1510, 1511, 1514, 1521, 1527, 1529, 1536, 1539, 1545, 1546, 1551, 1566, 1573, 1588, 1602, 1608, 1619, 1622, 1627, 1645, 1646, 1656, 1659, 1661, 1684, 1688, 1694, 1696, 1698, 1700, 1706, 1709, 1710, 1714, 1715, 1718, 1724, 1736, 1745, 1748, 1752, 1754, 1765, 17

In [25]:
def get_level_transition(levels):
    """Build the nested cluster tree from per-vertex level labels.

    ``levels`` maps a vertex to its list of community labels, one per level.
    Every (level, label) pair becomes a node titled ``L-<level>-<label>``; a
    node at level ``i`` is made a child of the node its vertex belongs to at
    level ``i + 1``, without duplicating children. The node
    ``L-<last level>-0`` is returned as the root.
    """
    nodes_by_title = {}
    depth = len(levels[0])
    for lower in range(depth - 1):
        for labels in levels.values():
            child_title = "L-{}-{}".format(lower, labels[lower])
            parent_title = "L-{}-{}".format(lower + 1, labels[lower + 1])

            # Create the child on first sight; a child that already exists
            # was built as a parent in the previous level and is reused.
            if child_title not in nodes_by_title:
                nodes_by_title[child_title] = {
                    "title": child_title,
                    "key": child_title
                }
            child_node = nodes_by_title[child_title]

            if parent_title not in nodes_by_title:
                nodes_by_title[parent_title] = {
                    "title": parent_title,
                    "key": parent_title,
                    'children': [child_node]
                }
                continue

            # Existing parent: attach the child only once.
            siblings = nodes_by_title[parent_title]['children']
            if all(sibling['title'] != child_title for sibling in siblings):
                siblings.append(child_node)
    return nodes_by_title['L-{}-{}'.format(depth - 1, 0)]
# Spot-check the level labels of one vertex (index 500, chosen arbitrarily as
# far as this cell shows), then build the nested hierarchy.
print(renumbered_levels[500])
hierarchies = get_level_transition(renumbered_levels)

[66, 188, 8, 3, 0]


In [None]:
# NOTE(review): duplicate of the save_json defined in an earlier cell; this
# later definition silently replaces it when the cell is run.
def save_json(data, filepath=r'new_data.json'):
   """Write ``data`` to ``filepath`` as JSON indented by four spaces."""
   with open(filepath, 'w') as fp:
      json.dump(data, fp, indent=4)

In [None]:
# NOTE(review): this prints the complete second-level partition (one entry per
# vertex); a size or a small sample would keep the output readable.
print(partitions[1])
# NOTE(review): this writes to ".../network/ravasz_partitions_entity.json",
# while the other cells read ".../network/server/ravasz_partitions_entity.json"
# -- confirm which location is intended.
save_json(partitions, "data/result/AllTheNews/network/ravasz_partitions_entity.json")
# save_json(partitions, "data/result/RAMS/gpt_biHgraph_dev/ravasz_partitions.json")

In [26]:
# Persist the nested hierarchy.
# The directory used to be spelled "AllThenews" here; every other path in this
# notebook uses "AllTheNews", and the old spelling only resolves on
# case-insensitive file systems. The bare `hierarchies` expression that
# preceded the save had no effect (it was not the last statement) and is gone.
save_json(hierarchies, "data/result/AllTheNews/network/server/ravasz_hierarchies_entity.json")
# save_json(hierarchies, "data/result/RAMS/gpt_biHgraph_dev/ravasz_hierarchies.json")

In [None]:
# Reload the frontend export (closing the file handle afterwards) and collect
# its entity node records.
with open("data/result/AllTheNews/network/server/frontend_2.json") as frontend_file:
    frontend_data = json.load(frontend_file)
entity_nodes = [node for node in frontend_data['nodes'] if node['type'] == 'entity']
print(len(entity_nodes))

In [None]:
# Reload the saved entity partitions (closing the file handle afterwards) and
# report how many nodes the first level covers.
with open("data/result/AllTheNews/network/server/ravasz_partitions_entity.json") as partitions_file:
    partitions = json.load(partitions_file)
print(len(partitions[0]))

In [None]:
# Keep the article nodes whose id occurs among the keys of the first
# partition level, then show how many remain.
first_level_ids = partitions[0]
filtered_article_nodes = [node for node in article_nodes if node['id'] in first_level_ids]
len(filtered_article_nodes)

In [None]:
# Restrict the frontend graph to the retained articles, the links touching
# them, and the entities on those links, then overwrite the frontend file.
# Membership tests use sets; the earlier list scans did a full pass over the
# id lists for every link and every entity.
links = frontend_data['links']
filtered_article_node_ids = [article_node['id'] for article_node in filtered_article_nodes]
article_id_lookup = set(filtered_article_node_ids)
filtered_links = [
    link for link in links
    if link['source'] in article_id_lookup or link['target'] in article_id_lookup
]
print(len(filtered_links), len(links))
filtered_sources = [link['source'] for link in filtered_links]
filtered_targets = [link['target'] for link in filtered_links]
linked_ids = set(filtered_sources) | set(filtered_targets)
entity_nodes = [node for node in frontend_data['nodes'] if node['type'] == 'entity']
filtered_entities = [entity_node for entity_node in entity_nodes if entity_node['id'] in linked_ids]
print(len(filtered_entities), len(entity_nodes))
frontend_data['nodes'] = filtered_article_nodes + filtered_entities
frontend_data['links'] = filtered_links
# NOTE(review): this overwrites the file that frontend_data was loaded from.
save_json(frontend_data, "data/result/AllTheNews/network/server/frontend_2.json")