In [1]:
import networkx as nx
import hypernetx as hnx
import numpy as np
from scipy import spatial
from scipy.sparse import csr_matrix

import json
import hypernetx.algorithms.hypergraph_modularity as hmod
import igraph as ig
from collections import defaultdict
from itertools import combinations

import itertools
import copy
import time
import sys
import operator
import math

In [None]:
# All The News
# Load the frontend graph and the article table. `with` closes each file as soon
# as it has been parsed instead of leaving the handle to the garbage collector.
with open('data/result/AllTheNews/network/server/frontend.json') as fp:
    frontend_data = json.load(fp)
article_nodes = [node for node in frontend_data['nodes'] if node['type'] == 'article']
with open('data/result/AllTheNews/network/articles.json') as fp:
    article_data = json.load(fp)
# Sanity check: number of article nodes vs. number of article records.
print(len(article_nodes), len(article_data))

In [2]:
# VisPub
# Load the frontend graph and the article table. `with` closes each file as soon
# as it has been parsed instead of leaving the handle to the garbage collector.
with open('data/result/VisPub/network/server/frontend.json') as fp:
    frontend_data = json.load(fp)
article_nodes = [node for node in frontend_data['nodes'] if node['type'] == 'article']
with open('data/result/VisPub/network/articles.json') as fp:
    article_data = json.load(fp)
# Sanity check: number of article nodes vs. number of article records.
print(len(article_nodes), len(article_data))

2632 2632


In [None]:
# Keep only the entity-typed nodes of the frontend graph.
entity_nodes = [candidate for candidate in frontend_data['nodes'] if candidate['type'] == 'entity']

In [None]:
# Report how many entity nodes there are, then index them by their 'id' field.
print(len(entity_nodes))
entity_node_dict = dict((entity['id'], entity) for entity in entity_nodes)

In [None]:
# read network
# AllTheNews
# Each file is opened in a `with` block so the handle is closed right after parsing.
with open('data/result/AllTheNews/network/hgraph.json') as fp:
    B = nx.node_link_graph(json.load(fp))
print(B.number_of_nodes(), B.number_of_edges())
with open('data/result/AllTheNews/network/entities.json') as fp:
    entity_data = json.load(fp)
with open('data/result/AllTheNews/network/articles.json') as fp:
    article_data = json.load(fp)
# Prints: (#graph nodes - #articles), #edges, #articles.
print(B.number_of_nodes()-len(article_data.keys()), B.number_of_edges(), len(article_data.keys()))

In [3]:
# VisPub
# Each file is opened in a `with` block so the handle is closed right after parsing.
with open('data/result/VisPub/network/hgraph.json') as fp:
    B = nx.node_link_graph(json.load(fp))
print(B.number_of_nodes(), B.number_of_edges())
with open('data/result/VisPub/network/entities.json') as fp:
    entity_data = json.load(fp)
with open('data/result/VisPub/network/articles.json') as fp:
    article_data = json.load(fp)
# Prints: (#graph nodes - #articles), #edges, #articles.
print(B.number_of_nodes()-len(article_data.keys()), B.number_of_edges(), len(article_data.keys()))

7733 11775
5101 11775 2632


In [None]:
# Debug dump: print every node with its attribute dict, then each node's
# 'bipartite' attribute on its own line.
# NOTE(review): the VisPub graph has 7733 nodes, so this floods the output area;
# consider inspecting a small sample instead.
print(B.nodes(data=True))
for n, d in B.nodes(data=True):
    print(d['bipartite'])

In [4]:
# Build a hypergraph from the bipartite graph B and display its shape.
# The recorded output (5101, 2632) matches the non-article / article counts printed
# when the VisPub network was loaded.
H = hnx.Hypergraph.from_bipartite(B)
H.shape

(5101, 2632)

## Reduce the hypergraph to its two-section graph, using the edge reweighting proposed in [1]


[1] Kumar T., Vaidyanathan S., Ananthapadmanabhan H., Parthasarathy S. and Ravindran B. “A New Measure of Modularity in Hypergraphs: Theoretical Insights and Implications for Effective Clustering”. In: Cherifi H., Gaito S., Mendes J., Moro E., Rocha L. (eds) Complex Networks and Their Applications VIII. COMPLEX NETWORKS 2019. Studies in Computational Intelligence, vol 881. Springer, Cham

In [6]:
# entity_data = json.load(open('data/result/AllTheNews/network/entities_w_description_embedding.json'))
# entity_data = json.load(open('data/result/VisPub/entity_embeddings.json'))
# Load the embedding files inside `with` blocks so the handles are closed after parsing.
with open('data/result/VisPub/network/server/entity_explanations_embeddings.json') as fp:
    entity_data = json.load(fp)
with open('data/result/VisPub/network/server/article_embeddings.json') as fp:
    article_data = json.load(fp)
# entity_data is iterated as a mapping (id -> record); article_data as a sequence of
# records that each carry their own 'doc_id'.
entity_embeddings = {entity_id: entity['embedding'] for entity_id, entity in entity_data.items()}
article_embeddings = {article['doc_id']: article['embedding'] for article in article_data}

In [None]:
# print(embeddings_dict["23158"])
# print(embeddings_dict['25256'])
# print(len(embeddings_dict), len(article_data))
# article_embeddings = {doc_id: embeddings_dict[doc_id]['embedding'] for doc_id, article_datum in article_data.items()}

In [7]:
# clustering on hyperedges
# Take the dual of H; the following cell uses dual_H to cluster articles and notes
# that H itself would be used to cluster entities.
dual_H = H.dual()
# print(dual_H.shape)

In [None]:
# Display the hypergraph shape again for reference.
H.shape

In [8]:
# dual_H is for clustering articles, H is for clustering entities
component_subgraphs = dual_H.s_component_subgraphs(edges=False, return_singletons=True)
# component_subgraphs = H.s_component_subgraphs(edges=False, return_singletons=True)

G_ccs = ig.Graph()
# weights[u][v]['weight'] holds the reweighted two-section edge weight between the
# vertices named u and v; every vertex also gets a zero-weight self entry so that it
# appears as a key even when it has no edges.
weights = defaultdict(lambda: defaultdict(dict))
total = 0
event_set = set()
total_edges = 0
print("finding connected components...")
for s_component in component_subgraphs:
    print("total: ", total)
    # NOTE(review): 2500 is an unexplained cut-off on the number of processed nodes;
    # confirm the intended value (nodes skipped here are re-added as isolated
    # vertices by the cell that builds the final graph).
    if total > 2500: break
    total += s_component.shape[0]
    if s_component.shape[0] == 1:
        event = list(s_component.nodes())[0]
        # FIX: the original derived the event name from `v`, which is undefined when
        # the first component is a singleton and otherwise a stale vertex left over
        # from the previous component. The singleton's own id is what is meant.
        event_name = "-".join(str(event).split('-')[1:])
        if event_name not in event_set:
            G_ccs.add_vertices(list(s_component.nodes()))
        continue
    print("component_size: ", s_component.shape[0])
    print("reweighting...")
    cc = hmod.two_section(s_component)
    print("two_section graph size:", cc.vcount())
    index2id_dict = {vertex.index: vertex['name'] for vertex in cc.vs}

    # A vertex is flagged when its event name (the id with its first '-'-separated
    # token removed) has been seen before; all edges touching flagged vertices are dropped.
    # NOTE(review): ids containing no '-' all yield the empty event name, so every vertex
    # after the first one is flagged. The recorded VisPub run kept 0 edges, possibly for
    # this reason — confirm this is intended.
    deleted_vertices = []
    for v in cc.vs:
        event_name = "-".join(v['name'].split('-')[1:])
        if event_name in event_set:
            deleted_vertices.append(v['name'])
        event_set.add(event_name)
    # Set copy for O(1) membership tests while scanning the edges.
    deleted_vertex_names = set(deleted_vertices)

    # cc.delete_vertices(deleted_vertices)

    deleted_edges = [
        (edge.source, edge.target)
        for edge in cc.es
        if index2id_dict[edge.source] in deleted_vertex_names
        or index2id_dict[edge.target] in deleted_vertex_names
    ]
    cc.delete_edges(deleted_edges)

    # edges = [(e.source, e.target, e['weight']) for e in cc.es]
    # G_ccs.add_vertices([v['name'] for v in cc.vs])
    total_edges += len(cc.es)

    # G_ccs.add_edges([(index2id_dict[e.source], index2id_dict[e.target]) for e in cc.es])
    for v in cc.vs:
        weights[v['name']][v['name']]['weight'] = 0
    # Store every surviving edge in both directions.
    for e in cc.es:
        source_name = cc.vs[e.source]['name']
        target_name = cc.vs[e.target]['name']
        weights[source_name][target_name]['weight'] = e['weight']
        weights[target_name][source_name]['weight'] = e['weight']
    # if total >= 4000:
    #     break

# print([G_cc.vcount() for G_cc in G_ccs])
# print(G_ccs.vcount())
print(total_edges, len(weights))
# GU = ig.union(G_ccs)
# GU = ig.union(G_ccs)

finding connected components...
total:  0
component_size:  2482
reweighting...
two_section graph size: 2482
total:  2482
component_size:  2
reweighting...
two_section graph size: 2
total:  2484
component_size:  2
reweighting...
two_section graph size: 2
total:  2486
component_size:  2
reweighting...
two_section graph size: 2
total:  2488
total:  2489
total:  2490
total:  2491
total:  2492
total:  2493
total:  2494
total:  2495
total:  2496
total:  2497
total:  2498
total:  2499
total:  2500
total:  2501
0 2488


In [9]:
# Build the weighted graph from the nested weight dictionary, then add every article
# that is not yet a vertex (e.g. skipped or singleton components) as an isolated vertex.
G_ccs = ig.Graph.DictDict(weights)
# articles
all_article_nodes = [node for node in dual_H.nodes()]
largest_cc_nodes = [v['name'] for v in G_ccs.vs]
# Snapshot as a set: membership tests against the list made this loop quadratic.
known_vertex_names = set(largest_cc_nodes)
for article_id in all_article_nodes:
    if article_id not in known_vertex_names:
        G_ccs.add_vertex(article_id)

# entities
# all_entity_nodes = [node for node in H.nodes()]
# largest_cc_nodes = [v['name'] for v in G_ccs.vs]
# for entity_id in all_entity_nodes:
#     if entity_id not in largest_cc_nodes:
#         G_ccs.add_vertex(entity_id)
G_ccs.vcount()

2632

In [None]:
# print(G_ccs.vcount(), dual_H.shape)
# # add singletons back to G_ccs for clustering
# # there should be 7543 nodes in H
# singletons = [node for node in B.nodes() if B.degree(node) == 0]
# for singleton in singletons:
#     G_ccs.add_vertex(singleton)
# `article_data` is a dict after the network cell but a list after the embeddings cell;
# len() works for both, whereas `.keys()` raised AttributeError on the list.
print(len(article_data), G_ccs.vcount(), G_ccs.ecount())

In [None]:
# id2index_dict = {}
# for v in G_ccs.vs:
#     id2index_dict[v['name']] = v.index

In [10]:
# articles
# Map each vertex index of G_ccs to the embedding stored under the vertex's name.
attr_dict = dict((vertex.index, article_embeddings[vertex['name']]) for vertex in G_ccs.vs)
# entities
# attr_dict = {v.index: entity_embeddings[v['name']] for v in G_ccs.vs}

In [11]:
def distance_matrix(G, attr_dict):
    """Return the pairwise cosine-distance matrix of the vertex attribute vectors.

    Args:
        G: graph-like object whose `vs` iterates vertices exposing an `index` attribute.
        attr_dict: mapping from vertex index to that vertex's attribute vector.

    Returns:
        A square numpy array whose entry [a][b] is the cosine distance between the
        vectors of the a-th and b-th vertices in `G.vs` iteration order. An empty
        (0, 0) array is returned when the graph has no vertices.
    """
    embeddings = np.array([attr_dict[v.index] for v in G.vs])
    if len(embeddings) == 0:
        # cdist rejects the 1-D array produced by an empty vertex list.
        return np.zeros((0, 0))
    return spatial.distance.cdist(embeddings, embeddings, metric='cosine')

In [12]:
# Precompute the pairwise cosine distances between the vertex embeddings once,
# so they can be handed to ravasz() instead of being recomputed there.
D = distance_matrix(G_ccs, attr_dict)

In [13]:
def ravasz(G, attr_dict, D=None):
    """Build a bottom-up cluster hierarchy by repeatedly merging each vertex into the
    community of its most similar vertex and then fusing communities into a new graph.

    Args:
        G: graph object providing `vs`, `vcount()` and
            `get_adjacency(attribute='weight')` (an igraph Graph in this notebook).
        attr_dict: mapping from vertex index to that vertex's attribute vector.
        D: optional precomputed pairwise distance matrix for `G`; when None it is
            computed with `distance_matrix(G, attr_dict)`.

    Returns:
        `levels`: a defaultdict mapping each original vertex index to a list of
        community labels, one entry per clustering pass. The list always ends with
        one `None` placeholder: a slot is appended at the start of every pass, but
        labels are written starting at index 0, overwriting the initial seed label.

    Side effects:
        Prints progress messages, including several lines per vertex in every pass.
    """
    # Row sums of the adjacency matrix, keyed by row index.
    def weighted_degree(A):
        return {v: sum(A[v]) for v in range(0, len(A))}

    # Sum of the weights from i and from j to every neighbour they share.
    def weighted_common_neighbors(G, A, i, j):
        i_neighbors = G.neighbors(i)
        j_neigobors = G.neighbors(j)
        common_neighbors = list(set(i_neighbors).intersection(set(j_neigobors)))
        return sum([A[i][v] for v in common_neighbors]) + sum([A[j][v] for v in common_neighbors])


    # def weighted_TO(i, j, G, A, K, J):
    #     # J = weighted_common_neighbors(G, A, i, j)
    #     return J[i][j]/ (min(K[i], K[j]) + 1 - A[i][j])
    # Weighted topological-overlap style score for the pair (i, j).
    # Only referenced from the commented-out code in similarity(); it also prints
    # every pair it is called with.
    def weighted_TO(i, j, G, A, K):
        print(i, j)
        J = weighted_common_neighbors(G, A, i, j)
        return J/ (min(K[i], K[j]) + 1 - A[i][j])

    # Despite the name, this caps every entry at `max_value` (element-wise min).
    def map_max(twod_list, max_value):
        return [[min(max_value, x) for x in row] for row in twod_list]

    # Sum of all entries of a 2-D list (not called in the active code below).
    def twod_sum(twod_list):
        return sum([sum(row) for row in twod_list])

    # Singleton partition: every vertex index maps to its position in G.vs.
    def partition(G):
        P = {}
        for index, v in enumerate(G.vs):
            P[v.index] = index
        return P

    # Similarity matrix used to pick merge partners. In the active code it is just
    # 1 - D with the diagonal forced to -inf so a vertex never selects itself;
    # A and K are accepted but unused because the connectivity term (CS) is disabled.
    def similarity(G, A, K, D):
        SS = 1 - D
        n = G.vcount()
        for i in range(n):
            SS[i][i] = -math.inf
        print("generating combinations")
        # combinations_indices = combinations(range(n), 2)
        row_indices = np.arange(n)
        # print("generating J")
        # print("multiplying A and A.T")
        # sparse_A = csr_matrix(np.array(A))
        # sparse_common_neighbors_matrix = sparse_A.dot(sparse_A.T)
        # dense_common_neighbors_matrix = sparse_common_neighbors_matrix.A
        # Compute the weighted common neighbors matrix
        # print("summing outer weights")
        # J = np.add.outer(dense_common_neighbors_matrix, dense_common_neighbors_matrix)
        # J = np.vectorize(weighted_common_neighbors, excluded=['G', 'A'])(
        #     row_indices[:, np.newaxis],
        #     row_indices,
        #     G=G,
        #     A=A
        # )
        print("generating CS")
        # CS = np.vectorize(weighted_TO, excluded=['G', 'A', 'K', 'J'])(
        #     row_indices[:, np.newaxis],
        #     row_indices,
        #     G=G,
        #     A=A,
        #     K=K,
        #     J=J
        # )

        # CS = np.vectorize(weighted_TO, excluded=['G', 'A', 'K'])(
        #     row_indices[:, np.newaxis],
        #     row_indices,
        #     G=G,
        #     A=A,
        #     K=K
        # )


        return SS
        # return (SS + CS) / 2

    # Invert a vertex -> community mapping into community -> [vertices], renumbering
    # the communities 0, 1, 2, ... in the order they are first encountered.
    def reverse_index(P):
        comms = defaultdict(list)
        for v, comm in P.items():
            comms[comm].append(v)
        renumber_dict = {}
        for index, comm in enumerate(list(comms.keys())):
            renumber_dict[comm] = index
        renumbered_comms_dict = {
            renumber_dict[comm]: vertices for comm, vertices in comms.items()
        }
        return renumbered_comms_dict

    # Total adjacency weight over all (v1, v2) pairs with v1 in comm1 and v2 in comm2.
    def calculate_weights(comm1, comm2, A):
        total = 0
        for v1 in comm1:
            for v2 in comm2:
                total += A[v1][v2]
        return total

    # Collapse each community into one vertex: the weight between two communities
    # (including a community with itself) is the summed weight between their members.
    # Returns the fused graph and its adjacency matrix capped at 1.
    def fusion_matrix_adjacency(A, comms):
        print("fusion matrix adjacency comms: ", len(comms))

        new_weights = defaultdict(lambda: defaultdict(dict))
        for comm1, vertices1 in comms.items():
            for comm2, vertices2 in comms.items():
                new_weights[comm1][comm2]['weight'] = calculate_weights(vertices1, vertices2, A)
                # weights[comm2][comm1]['weight'] = weights[comm1][comm2]['weight']
        clustered_G = ig.Graph.DictDict(new_weights)
        clustered_A = map_max(clustered_G.get_adjacency(attribute='weight'), 1)
        return clustered_G, clustered_A

    # Attribute vector of a community = element-wise mean of its members' vectors.
    def recalculate_attr(attr_dict, comms):
        new_attr_dict = {}
        for comm, vertices in comms.items():
            avg_attr = np.mean(np.array([attr_dict[v] for v in vertices]), axis=0)
            new_attr_dict[comm] = avg_attr
        return new_attr_dict


    # This list is immediately superseded by the defaultdict assigned a few lines below.
    levels = []
    P = partition(G)
    comms_dict = reverse_index(P)
    # Alias, not a copy: until P is rebound at the end of the first pass, writes to P
    # are also writes to ori_graph_partition.
    ori_graph_partition = P
    levels = defaultdict(list)
    level = 0
    # init levels
    for v in G.vs:
        levels[v.index].append(P[v.index])
    # Adjacency weights capped at 1.
    A = map_max(G.get_adjacency(attribute='weight'), 1)
    if D is None:
        D = distance_matrix(G, attr_dict)
    while(True):
        # init level slot
        for v, cur_levels in levels.items():
            cur_levels.append(None)
        print("clustering begin")
        print("initial nodes:", G.vcount())
        print("calculating weighted_degree")
        K = weighted_degree(A)
        print("calculating similarity matrix")
        similarity_matrix = similarity(G, A, K, D)
        print("calculating reverse index of G")
        # Members (original vertices) of each current community, taken before the sweep.
        ori_graph_comms_dict = reverse_index(ori_graph_partition)
        # Only used for the progress print after the sweep.
        most_similar_nodes = set()
        for v in G.vs:
            print("finding most similar node")
            # most_similar_node = max(similarity_matrix[v.index].items(), key=operator.itemgetter(1))[0]
            # most_similar_node = max(similarity_matrix[v.index], key=operator.itemgetter(1))
            # Arg-max over the vertex's similarity row.
            most_similar_node = max(range(len(similarity_matrix[v.index])), key=similarity_matrix[v.index].__getitem__)

            print("moving node: ", v.index, " from comm: ", P[v.index], " to comm: ", P[most_similar_node])
            most_similar_nodes.add(P[most_similar_node])
            print(len(ori_graph_comms_dict), len(ori_graph_partition))
            # merge v into most_similar_node in G
            # for node in ori_graph_comms_dict[P[v.index]]:
            #     ori_graph_partition[node] = P[most_similar_node]
            #     levels[node][level] = P[most_similar_node]
            # for node in ori_graph_comms_dict[P[most_similar_node]]:
            #     ori_graph_partition[node] = P[most_similar_node]
            #     levels[node][level] = P[most_similar_node]
            # rewrite at G'
            # P is updated in place, so vertices visited later in this sweep see the
            # labels already reassigned to earlier vertices (the result is order-dependent).
            P[v.index] = P[most_similar_node]
        # Propagate the new label of every current-graph vertex to the original
        # vertices it represents and record it for this level.
        # NOTE(review): this looks up ori_graph_comms_dict by current-graph vertex index,
        # i.e. it assumes reverse_index's numbering matches the fused graph's vertex
        # indices — confirm.
        for v, c in P.items():
            for node in ori_graph_comms_dict[v]:
                ori_graph_partition[node] = c
                levels[node][level] = c
        ori_graph_comms_dict = reverse_index(ori_graph_partition)
        print("most similar nodes: ", len(most_similar_nodes), len(ori_graph_comms_dict))

        level += 1
        print("one iteration done")
        comms_dict = reverse_index(P)
        print("total nodes in comms:", sum([len(x) for x in ori_graph_comms_dict.values()]))
        c_G, c_A  = fusion_matrix_adjacency(A, comms_dict)
        print("clusters: ", c_G.vcount())
        # preserve the hierarchy
        attr_dict = recalculate_attr(attr_dict, comms_dict)
        # construct new distances between clusters
        D = distance_matrix(c_G, attr_dict)
        P = partition(c_G)
        # assign the result to operate recursively
        # Stop when the graph that was just clustered had fewer than 10 vertices or
        # when fusing did not reduce the vertex count.
        if G.vcount() < 10 or G.vcount() == c_G.vcount(): break
        print("pass done. ")
        G = c_G
        A = c_A
    return levels


In [14]:
# levels = ravasz(G_ccs, attr_dict, D)
# Run the hierarchical clustering with the precomputed distance matrix.
# The call prints several lines per vertex in every pass.
levels = ravasz(G_ccs, attr_dict, D)

clustering begin
initial nodes: 2632
calculating weighted_degree
calculating similarity matrix
generating combinations
generating CS
calculating reverse index of G
finding most similar node
moving node:  0  from comm:  0  to comm:  2462
2632 2632
finding most similar node
moving node:  1  from comm:  1  to comm:  1110
2632 2632
finding most similar node
moving node:  2  from comm:  2  to comm:  926
2632 2632
finding most similar node
moving node:  3  from comm:  3  to comm:  151
2632 2632
finding most similar node
moving node:  4  from comm:  4  to comm:  157
2632 2632
finding most similar node
moving node:  5  from comm:  5  to comm:  843
2632 2632
finding most similar node
moving node:  6  from comm:  6  to comm:  2494
2632 2632
finding most similar node
moving node:  7  from comm:  7  to comm:  8
2632 2632
finding most similar node
moving node:  8  from comm:  8  to comm:  8
2632 2632
finding most similar node
moving node:  9  from comm:  9  to comm:  340
2632 2632
finding most simi

In [18]:
def _renumber_dict(P):
    comm_set = set(P.values())
    renumber_dict = {comm: index for index, comm in enumerate(comm_set)}
    return renumber_dict
    # P = {v: renumber_dict[comm] for v, comm in P.items()}
    # return P
    

def levels_to_partitions(G, levels):
    """Turn per-vertex level labels into one partition dict per level.

    The last entry of every vertex's label list is dropped first. Each remaining level
    is then renumbered with compact labels, and the renumbered labels are written back
    into `levels` (which is modified in place). If the last level still has more than
    one community, a final all-zero level is appended to both results.

    Args:
        G: graph whose `vs` vertices expose `.index` and `['name']`.
        levels: mapping from vertex index to its list of labels per level.

    Returns:
        (partitions, levels): `partitions` is a list of {vertex name: label} dicts,
        one per level; `levels` is the same object that was passed in.
    """
    for vertex in G.vs:
        levels[vertex.index] = levels[vertex.index][:-1]

    partitions = []
    depth = len(levels[0])
    for level in range(depth):
        raw_labels = {vertex['name']: levels[vertex.index][level] for vertex in G.vs}
        renumber_dict = _renumber_dict(raw_labels)
        compact = {name: renumber_dict[label] for name, label in raw_labels.items()}
        for vertex in G.vs:
            levels[vertex.index][level] = compact[vertex['name']]
        partitions.append(compact)

    # Make sure the hierarchy ends in a single root community.
    if len(set(partitions[-1].values())) > 1:
        partitions.append({vertex['name']: 0 for vertex in G.vs})
        for vertex in G.vs:
            levels[vertex.index].append(0)
    return partitions, levels
# Pass a deep copy because levels_to_partitions rewrites the label lists it is given;
# the original `levels` returned by ravasz() stays untouched.
partitions, renumbered_levels = levels_to_partitions(G_ccs, copy.deepcopy(levels))

In [19]:
# Number of node ids contained in the partition at index 1.
print(len(partitions[1]))

5101


In [16]:
def save_json(data, filepath=r'new_data.json'):
    """Write `data` to `filepath` as JSON with a 4-space indent, replacing any
    existing file content."""
    with open(filepath, mode='w') as out_file:
        json.dump(data, out_file, indent=4)


In [19]:
# Persist the article partitions (one {node id: label} dict per level).
save_json(partitions, 'data/result/VisPub/network/server/ravasz_partitions_article.json')
# save_json(partitions, 'data/result/VisPub/network/server/ravasz_partitions_entity.json')

In [28]:
def add_dummy_partition(partitions):
    """Prepend a level in which every node is its own cluster.

    The new first partition maps each node id of the current first partition to its
    position in that partition's key order. `partitions` is modified in place and
    also returned.
    """
    dummy_partition = {node_id: position for position, node_id in enumerate(partitions[0])}
    partitions.insert(0, dummy_partition)
    return partitions
# partitions = json.load(open('data/result/AllTheNews/network/server/ravasz_partitions_article.json', 'r'))
# partitions = json.load(open('data/result/AllTheNews/network/server/ravasz_partitions_entity.json', 'r'))
with open('data/result/VisPub/network/server/ravasz_partitions_article.json', 'r') as fp:
    partitions = json.load(fp)
# Idempotency guard: this cell reads and overwrites the same file, so re-running it
# used to stack one more dummy level each time. The dummy level is recognisable because
# its labels are exactly 0..n-1 in key order; a real clustering level of two or more
# nodes always merges at least two of them and therefore never looks like that.
first_level_labels = list(partitions[0].values())
if first_level_labels != list(range(len(first_level_labels))):
    partitions = add_dummy_partition(partitions)
    save_json(partitions, 'data/result/VisPub/network/server/ravasz_partitions_article.json')

# partitions = json.load(open('data/result/VisPub/network/server/ravasz_partitions_entity.json', 'r'))
# partitions = add_dummy_partition(partitions)
# save_json(partitions, 'data/result/VisPub/network/server/ravasz_partitions_entity.json')

In [23]:
def get_level_transition(levels):
    """Build a nested tree of communities from per-vertex level labels.

    Every community is a dict with "title" and "key" (both "L-<level>-<label>") and,
    unless it is a lowest-level community, a "children" list holding the community
    dicts of the level below, in first-seen order and without duplicates.

    Args:
        levels: mapping from vertex to its list of community labels, lowest level
            first; must contain the key 0, whose list length defines the number of levels.

    Returns:
        The community dict titled "L-<last level>-0", i.e. the root of the tree.

    Raises:
        KeyError: if no community "L-<last level>-0" was created (for example when
            there is only a single level).
    """
    nested_comms = {}
    # (parent title, child title) pairs that are already attached; replaces the
    # linear scan over the parent's children that was run for every vertex.
    attached = set()
    n_levels = len(levels[0])
    for i in range(n_levels - 1):
        for transitions in levels.values():
            child_title = "L-{}-{}".format(i, transitions[i])
            parent_title = "L-{}-{}".format(i + 1, transitions[i + 1])
            # Create the child the first time it is seen (only happens at the lowest
            # level; higher-level children were created as parents one level earlier).
            child = nested_comms.get(child_title)
            if child is None:
                child = {"title": child_title, "key": child_title}
                nested_comms[child_title] = child
            if (parent_title, child_title) in attached:
                continue
            attached.add((parent_title, child_title))
            parent = nested_comms.get(parent_title)
            if parent is None:
                nested_comms[parent_title] = {
                    "title": parent_title,
                    "key": parent_title,
                    'children': [child]
                }
            else:
                parent['children'].append(child)
    final_level = n_levels - 1
    return nested_comms['L-{}-{}'.format(final_level, 0)]
# Spot-check the per-level labels of vertex 500, then build the nested hierarchy
# from the renumbered levels and persist it.
print(renumbered_levels[500])
hierarchies = get_level_transition(renumbered_levels)
save_json(hierarchies, "data/result/VisPub/network/server/ravasz_hierarchies_article.json")
# save_json(hierarchies, "data/result/VisPub/network/server/ravasz_hierarchies_entity.json")

[384, 15, 16, 1, 1, 0]


In [24]:
def dfs(hierarchy, leaf_children_dict):
    """Shift a hierarchy one level up, in place, and hang dummy leaves under it.

    Every visited node has the level in its "title"/"key" ("L-<level>-<label>")
    increased by one. A node without a 'children' entry receives new children
    "L-0-<dummy label>", one per entry of `leaf_children_dict[<label>]`, where
    <label> is the node's label as a string. Prints the looked-up dummy labels
    and the node label for every such node. Returns None.
    """
    title_parts = hierarchy['title'].split("-")
    cur_cluster_label = title_parts[2]
    shifted_title = "L-{}-{}".format(str(int(title_parts[1]) + 1), cur_cluster_label)
    hierarchy['title'] = shifted_title
    hierarchy['key'] = shifted_title

    if 'children' not in hierarchy:
        # Former leaf: attach the dummy clusters that belong to it.
        dummy_clusters = leaf_children_dict[cur_cluster_label]
        print(dummy_clusters, cur_cluster_label)
        hierarchy['children'] = [
            {"title": "L-0-{}".format(label), "key": "L-0-{}".format(label)}
            for label in dummy_clusters
        ]
        return

    for child in hierarchy['children']:
        dfs(child, leaf_children_dict)
    return

def add_dummy_hierarchy(partitions, hierarchies):
    """Extend `hierarchies` (in place) with the dummy level stored in `partitions[0]`.

    For every node, its label in `partitions[0]` is grouped under the string form of
    its label in `partitions[1]`; the grouping is printed and handed to `dfs`, which
    shifts the tree up one level and attaches the dummy leaves.

    Returns:
        The same `hierarchies` object that was passed in.
    """
    first_partition, second_partition = partitions[0], partitions[1]
    second_level_children_dict = defaultdict(list)
    for node_id in first_partition:
        parent_key = str(second_partition[node_id])
        second_level_children_dict[parent_key].append(first_partition[node_id])
    print(second_level_children_dict)
    dfs(hierarchies, second_level_children_dict)
    return hierarchies
# hierarchies = json.load(open("data/result/AllThenews/network/server/ravasz_hierarchies_entity.json"))
# hierarchies = json.load(open("data/result/VisPub/network/server/ravasz_hierarchies_article.json"))
# save_json(new_hierarchies, "data/result/AllTheNews/network/server/ravasz_hierarchies_entity.json")

# NOTE(review): this cell reads and overwrites the same file and shifts every level by
# one, so running it a second time corrupts the saved hierarchy. Regenerate the file
# with get_level_transition before re-running.
with open("data/result/VisPub/network/server/ravasz_hierarchies_article.json") as fp:
    hierarchies = json.load(fp)
new_hierarchies = add_dummy_hierarchy(partitions, hierarchies)
save_json(new_hierarchies, "data/result/VisPub/network/server/ravasz_hierarchies_article.json")

# hierarchies = json.load(open("data/result/VisPub/network/server/ravasz_hierarchies_entity.json"))
# new_hierarchies = add_dummy_hierarchy(partitions, hierarchies)
# save_json(new_hierarchies, "data/result/VisPub/network/server/ravasz_hierarchies_entity.json")

defaultdict(<class 'list'>, {'246': [0], '506': [1, 1110, 1725, 1798, 2546], '453': [2, 749], '80': [3, 122, 151], '86': [4, 157], '436': [5, 367, 843], '271': [6, 2494], '4': [7, 8, 1339, 1382, 2329, 2394], '203': [9], '26': [10, 36, 47, 67, 79, 127, 149, 822, 1326, 1364, 1375, 1384, 1388, 1536, 1595, 2346, 2355, 2386, 2392, 2616], '378': [11], '810': [12, 1914], '852': [13, 618, 1492, 2000], '507': [14, 24, 1115, 1118, 1508], '181': [15, 2628], '643': [16, 848], '60': [17, 688], '184': [18, 320, 731, 1151, 1250, 1410, 1476, 1481, 1500], '346': [19, 554], '511': [20, 658], '293': [21, 272, 273, 276, 485, 534, 540, 983, 1356, 1757, 2404, 2405, 2458, 2539], '14': [22], '85': [23, 967, 2204], '451': [25, 916], '205': [26, 343, 2085], '558': [27, 1193, 1264, 1558], '281': [28], '719': [29, 1417], '776': [30, 2216], '109': [31, 1873, 2187, 2244], '380': [32, 247, 633, 1970], '18': [33, 34, 750], '91': [35, 1015, 1114], '598': [37, 1391], '32': [38, 54], '420': [39, 632, 774, 930], '245': [

In [None]:
# Print the partition at index 1 in full and write all partitions to the AllTheNews
# entity file.
# NOTE(review): the preceding cells load `partitions` from the VisPub *article* file,
# so running the notebook top to bottom would write VisPub article partitions into the
# AllTheNews entity file — confirm the intended dataset before running this cell.
print(partitions[1])
save_json(partitions, "data/result/AllTheNews/network/ravasz_partitions_entity.json")
# save_json(partitions, "data/result/RAMS/gpt_biHgraph_dev/ravasz_partitions.json")

In [None]:
# The bare expression is not the last statement of the cell, so it displays nothing.
# NOTE(review): the preceding cells load `hierarchies` from the VisPub *article* file,
# while this writes it to the *entity* file — confirm before running.
hierarchies
# save_json(hierarchies, "data/result/VisPub/network/server/ravasz_hierarchies_article.json")
save_json(hierarchies, "data/result/VisPub/network/server/ravasz_hierarchies_entity.json")
# save_json(hierarchies, "data/result/AllThenews/network/server/ravasz_hierarchies_entity.json")
# save_json(hierarchies, "data/result/RAMS/gpt_biHgraph_dev/ravasz_hierarchies.json")

In [None]:
# Load the filtered frontend graph; `with` closes the file right after parsing.
with open("data/result/AllTheNews/network/server/frontend_2.json") as fp:
    frontend_data = json.load(fp)
entity_nodes = [node for node in frontend_data['nodes'] if node['type'] == 'entity']
print(len(entity_nodes))

In [None]:
# Load the saved entity partitions; `with` closes the file right after parsing.
with open("data/result/AllTheNews/network/server/ravasz_partitions_entity.json") as fp:
    partitions = json.load(fp)
# Number of node ids in the first partition.
print(len(partitions[0]))

In [None]:
# Keep the article nodes whose id is a key of the first partition, and show how many remain.
filtered_article_nodes = [candidate for candidate in article_nodes if candidate['id'] in partitions[0]]
len(filtered_article_nodes)

In [None]:
# Restrict the frontend graph to the filtered articles: keep links that touch one of
# them, keep entities that appear on a kept link, then overwrite frontend_2.json.
links = frontend_data['links']
filtered_article_node_ids = [article_node['id'] for article_node in filtered_article_nodes]
# Set copies give O(1) membership tests; the list scans were O(links * articles).
filtered_article_id_set = set(filtered_article_node_ids)
filtered_links = [
    link for link in links
    if link['source'] in filtered_article_id_set or link['target'] in filtered_article_id_set
]
print(len(filtered_links), len(links))
filtered_sources = [link['source'] for link in filtered_links]
filtered_targets = [link['target'] for link in filtered_links]
linked_node_id_set = set(filtered_sources) | set(filtered_targets)
entity_nodes = [node for node in frontend_data['nodes'] if node['type'] == 'entity']
filtered_entities = [entity_node for entity_node in entity_nodes if entity_node['id'] in linked_node_id_set]
print(len(filtered_entities), len(entity_nodes))
frontend_data['nodes'] = filtered_article_nodes + filtered_entities
frontend_data['links'] = filtered_links
save_json(frontend_data, "data/result/AllTheNews/network/server/frontend_2.json")

In [None]:
import requests
import json
def query_wikidata(Qid):
    """Fetch the English description of a Wikidata item through the Wikibase REST API.

    Args:
        Qid: Wikidata item id (e.g. "Q42"), inserted into the request URL.

    Returns:
        The value stored under 'en' in the item's descriptions response.

    Raises:
        OSError: if the token file 'wikidata_access_token' cannot be read.
        requests.RequestException: on network failure, timeout or an HTTP error status.
        KeyError: if the item has no English description.
    """
    # SECURITY: a literal access token used to be pasted here in a comment. It has been
    # removed; that token must be treated as leaked and revoked.
    # The token is read from a local file. Surrounding whitespace is stripped because a
    # trailing newline is not a valid header value.
    # NOTE(review): the file content is sent verbatim as the Authorization header, so it
    # presumably already includes the scheme prefix (e.g. "Bearer ...") — confirm.
    with open('wikidata_access_token') as token_file:
        access_token = token_file.read().strip()
    base = 'https://www.wikidata.org/w/rest.php/wikibase/v0'
    url = base + '/entities/items/{}/descriptions'.format(Qid)
    # url = base + '/entities/items/Q42/statements'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': access_token
    }
    # A timeout keeps one stalled request from hanging the whole batch loop.
    res = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on HTTP errors instead of indexing into an error payload.
    res.raise_for_status()
    res = res.json()
    return res['en']


In [None]:
# Attach a description to every entity: entities whose id differs from their title are
# looked up on Wikidata; everything else (and every failed lookup) falls back to the title.
with open('data/result/AllTheNews/network/entities.json') as fp:
    entity_dict = json.load(fp)
# The loop variable is named `entity` so it no longer overwrites the notebook-level
# `entity_data` loaded in an earlier cell.
for entity_id, entity in entity_dict.items():
    if entity['id'] != entity['title']:
        Qid = entity['id']
        try:
            description = query_wikidata(Qid)
        except Exception:
            # Best-effort lookup: any request/parsing failure falls back to the title.
            # `Exception` (not a bare except) keeps Ctrl-C able to stop the loop.
            description = entity['title']
        entity['description'] = description
    else:
        entity['description'] = entity['title']
save_json(entity_dict, 'data/result/AllTheNews/network/entities_w_description.json')

In [None]:
import json
import copy
def flatten_hierarchy(hierarchy):
    """Flatten a nested hierarchy into a dict keyed by node key.

    All descendants of `hierarchy` (the root itself is not included) are visited
    breadth-first. Each becomes an entry {"key", "title"} plus, when the node has a
    'children' entry, "children": the list of its children's keys.

    Args:
        hierarchy: nested dict with 'key', 'title' and optional 'children' per node.
            It is not modified.

    Returns:
        Dict mapping node key -> flattened entry, in breadth-first order. Empty when
        the root has no 'children' entry.
    """
    # Shallow copy so that extending the work list never touches the input's own list.
    pending = list(hierarchy.get('children', []))
    hierarchy_flattened = {}
    # A moving cursor replaces `queue = queue[1:]`, which copied the list on every step.
    cursor = 0
    while cursor < len(pending):
        cur = pending[cursor]
        cursor += 1
        entry = {
            "key": cur['key'],
            "title": cur['title'],
        }
        if 'children' in cur:
            pending.extend(cur['children'])
            entry["children"] = [child['key'] for child in cur['children']]
        hierarchy_flattened[cur['key']] = entry
    return hierarchy_flattened
# Load the saved entity hierarchy (closing the file after parsing) and flatten it.
with open('data/result/VisPub/network/server/ravasz_hierarchies_entity.json') as fp:
    entity_hierarchy = json.load(fp)
hierarchy_flattened_entity = flatten_hierarchy(entity_hierarchy)

In [None]:
# Display the flattened hierarchy (one entry per node key).
hierarchy_flattened_entity

In [None]:
# Load the saved entity partitions (closing the file after parsing) and pick one level.
with open('data/result/VisPub/network/server/ravasz_partitions_entity.json') as fp:
    partitions_entity = json.load(fp)
# Hierarchy level to inspect; also read as a global by getSubClusterLabels.
level = 2
level_partition = partitions_entity[level]
# Distinct cluster labels present at that level.
cluster_labels = list(set(level_partition.values()))
print(len(cluster_labels))

In [None]:
from pprint import pprint
# Pretty-print the complete {node id: cluster label} mapping of the selected level.
pprint(level_partition)

In [None]:
from collections import defaultdict
# Global lookup table read by getSubClusterLabels.
hierarchy_flattened = hierarchy_flattened_entity
def getSubClusterLabels(cluster_label):
    """Return the keys of the sub-clusters of cluster `cluster_label` at the global `level`.

    Looks the cluster up in the global `hierarchy_flattened` under
    "L-<level>-<cluster_label>". A cluster without children yields a one-element list
    holding its own full key. Otherwise its children are returned, descending through
    chains of single children for as long as that only child has children of its own.
    """
    full_label = "L-{}-{}".format(level, cluster_label)
    entry = hierarchy_flattened[full_label]
    if 'children' not in entry:
        return [full_label]

    sub_cluster_labels = entry['children']
    while True:
        if len(sub_cluster_labels) != 1:
            break
        only_child = hierarchy_flattened[sub_cluster_labels[0]]
        if 'children' not in only_child:
            break
        sub_cluster_labels = only_child['children']
    return sub_cluster_labels

def _binPartitions(partition, level):
    clusters = defaultdict(list)
    print(len(partition))
    for node_id, cluster_label in partition.items():
        full_cluster_label = "L-{level}-{cluster_label}".format(level=level, cluster_label=cluster_label)
        clusters[full_cluster_label].append(node_id)
    return clusters

# For every cluster of the selected level, resolve its sub-clusters and print the
# node ids of each one.
# NOTE(review): the full binning of the sub-cluster level is recomputed and printed on
# every iteration, which produces very large output.
for cluster_label in cluster_labels:
    sub_cluster_labels = getSubClusterLabels(cluster_label)
    # The hierarchy level of the sub-clusters is the second '-'-separated token of their key.
    sub_cluster_level = int(sub_cluster_labels[0].split("-")[1])
    partition = partitions_entity[sub_cluster_level]
    all_sub_cluster_at_level = _binPartitions(partition, sub_cluster_level)
    print(all_sub_cluster_at_level)
    print(cluster_label, sub_cluster_labels)
    for sub_cluster_label in sub_cluster_labels:
        print(sub_cluster_label, all_sub_cluster_at_level[sub_cluster_label])