In [3]:
import networkx as nx
import numpy as np
from scipy import spatial
from scipy.sparse import csr_matrix
import json
from collections import defaultdict
from itertools import combinations
import copy
import math
import glob

In [7]:
# All The News
summarized_embeddings = json.load(open('summary_embedding.json'))
summarized_ids = [article['id'] for article in summarized_embeddings]
full_text_embeddings = [json.load(open(article_filename)) for article_filename in glob.glob('full_text_embedding/*.json') if article_filename.split('/')[-1].split('.')[0].split("_")[-1] in summarized_ids]
print(len(summarized_embeddings), len(full_text_embeddings))

7638 7638


In [37]:
def transform(embeddings):
    idx_to_id_dict = {}
    embeddings_dict = {}
    G = nx.Graph()
    for index, article in enumerate(embeddings):
        idx_to_id_dict[index] = article['id']
        embeddings_dict[index] = article['embedding']
        G.add_node(index)
    return idx_to_id_dict, embeddings_dict, G
sum_idx_to_id_dict, sum_embeddings_dict, G_sum = transform(summarized_embeddings)
full_idx_to_id_dict, full_embeddings_dict, G_full = transform(full_text_embeddings)



In [38]:
print(len(G_sum.nodes))
print(len(G_full.nodes))

7638
7638


In [39]:
# articles
sum_attr_dict = sum_embeddings_dict
full_attr_dict = full_embeddings_dict
# entities

In [40]:
embeddings = np.array([full_attr_dict[v] for v in G_full.nodes])

In [41]:
def distance_matrix(G, attr_dict):
    def dist(vec1, vec2):
        return spatial.distance.cosine(vec1, vec2)
        # return np.linalg.norm(np.array(vec1) - np.array(vec2))**2
    # embeddings = np.array([attr_dict[v.index] for v in G.vs])
    embeddings = np.array([attr_dict[v] for v in G.nodes])
    return spatial.distance.cdist(embeddings, embeddings, metric='cosine')
    # create a list of list of distances
    # D = defaultdict(lambda: defaultdict(float))
    # for index1, v1 in enumerate(G.vs):
    #     for index2, v2 in enumerate(G.vs):
    #         v1_index = v1.index
    #         v2_index = v2.index
    #         embedding1 = attr_dict[v1_index]
    #         embedding2 = attr_dict[v2_index]
    #         distance = dist(embedding1, embedding2)
    #         D[v1_index][v2_index] = distance
    #         D[v2_index][v1_index] = distance
    return D

In [42]:
sum_D = distance_matrix(G_sum, sum_attr_dict)
full_D = distance_matrix(G_full, full_attr_dict)

In [56]:
def ravasz(G, attr_dict, D=None):
    def partition(G):
        P = {}
        for v in G.nodes:
            P[v] = v
        return P 
    def similarity(G, D):
        SS = 1 - D
        for i in range(len(G.nodes)):
            SS[i][i] = -math.inf
        return SS

    def reverse_index(P):
        comms = defaultdict(list)
        for v, comm in P.items():
            comms[comm].append(v)
        renumber_dict = {}
        for index, comm in enumerate(list(comms.keys())):
            renumber_dict[comm] = index
        renumbered_comms_dict = {
            renumber_dict[comm]: vertices for comm, vertices in comms.items()
        }
        return renumbered_comms_dict

    def calculate_weights(comm1, comm2, A):
        total = 0
        for v1 in comm1:
            for v2 in comm2:
                total += A[v1][v2]
        return total
    
    def recalculate_attr(attr_dict, comms):
        new_attr_dict = {}
        for comm, vertices in comms.items():
            avg_attr = np.mean(np.array([attr_dict[v] for v in vertices]), axis=0)
            new_attr_dict[comm] = avg_attr
        return new_attr_dict


    levels = []
    P = partition(G)
    comms_dict = reverse_index(P)
    ori_graph_partition = P
    levels = defaultdict(list)
    level = 0
    # init levels
    # for v in G.vs:
        # levels[v.index].append(P[v.index])
    for v in G.nodes:
        levels[v].append(P[v])
    if D is None:
        D = distance_matrix(G, attr_dict)
    while(True):
        # init level slot
        for v, cur_levels in levels.items():
            cur_levels.append(None)
        print("clustering begin")
        # print("initial nos:", G.vcount())
        print("initial nos:", len(G.nodes))
        print("calculating weighted_degree")
        print("calculating similarity matrix")
        similarity_matrix = similarity(G, D)

        print("calculating reverse index of G")
        ori_graph_comms_dict = reverse_index(ori_graph_partition)
        most_similar_nodes = set()
        # for v in G.vs:
        for v in G.nodes:
            print("finding most similar node")
            # most_similar_node = max(range(len(similarity_matrix[v.index])), key=similarity_matrix[v.index].__getitem__)
            most_similar_node = max(range(len(similarity_matrix[v])), key=similarity_matrix[v].__getitem__)

            # print("moving node: ", v.index, " from comm: ", P[v.index], " to comm: ", P[most_similar_node])
            print("moving node: ", v, " from comm: ", P[v], " to comm: ", P[most_similar_node])
            most_similar_nodes.add(P[most_similar_node])
            print(len(ori_graph_comms_dict), len(ori_graph_partition))
            P[v] = P[most_similar_node]
        for v, c in P.items():
            for node in ori_graph_comms_dict[v]:
                ori_graph_partition[node] = c
                levels[node][level] = c
        ori_graph_comms_dict = reverse_index(ori_graph_partition)
        print("most similar nodes: ", len(most_similar_nodes), len(ori_graph_comms_dict))

        level += 1
        print("one iteration done")
        comms_dict = reverse_index(P)
        print("total nodes in comms:", sum([len(x) for x in ori_graph_comms_dict.values()]))
        c_G = nx.Graph()
        c_G.add_nodes_from(list(comms_dict.keys()))
        print("clusters: ", len(c_G.nodes))
        # preserve the hierarchy
        attr_dict = recalculate_attr(attr_dict, comms_dict)
        # construct new distances between clusters
        D = distance_matrix(c_G, attr_dict)
        P = partition(c_G)
        # assign the result to operate recursively
        if len(G.nodes) < 10 or len(G.nodes) == len(c_G.nodes): break
        print("pass done. ")
        G = c_G
    return levels


In [57]:
# levels = ravasz(G_ccs, attr_dict, D)
sum_levels = ravasz(G_sum, sum_attr_dict, sum_D)

clustering begin
initial nos: 7638
calculating weighted_degree
calculating similarity matrix
calculating reverse index of G
finding most similar node
moving node:  0  from comm:  0  to comm:  141
7638 7638
finding most similar node
moving node:  1  from comm:  1  to comm:  3257
7638 7638
finding most similar node
moving node:  2  from comm:  2  to comm:  6085
7638 7638
finding most similar node
moving node:  3  from comm:  3  to comm:  6144
7638 7638
finding most similar node
moving node:  4  from comm:  4  to comm:  7436
7638 7638
finding most similar node
moving node:  5  from comm:  5  to comm:  2683
7638 7638
finding most similar node
moving node:  6  from comm:  6  to comm:  2848
7638 7638
finding most similar node
moving node:  7  from comm:  7  to comm:  2143
7638 7638
finding most similar node
moving node:  8  from comm:  8  to comm:  5240
7638 7638
finding most similar node
moving node:  9  from comm:  9  to comm:  5628
7638 7638
finding most similar node
moving node:  10  fro

In [58]:
full_levels = ravasz(G_full, full_attr_dict, full_D)

clustering begin
initial nos: 7638
calculating weighted_degree
calculating similarity matrix
calculating reverse index of G
finding most similar node
moving node:  0  from comm:  0  to comm:  84
7638 7638
finding most similar node
moving node:  1  from comm:  1  to comm:  1168
7638 7638
finding most similar node
moving node:  2  from comm:  2  to comm:  6785
7638 7638
finding most similar node
moving node:  3  from comm:  3  to comm:  5984
7638 7638
finding most similar node
moving node:  4  from comm:  4  to comm:  1576
7638 7638
finding most similar node
moving node:  5  from comm:  5  to comm:  3295
7638 7638
finding most similar node
moving node:  6  from comm:  6  to comm:  4653
7638 7638
finding most similar node
moving node:  7  from comm:  7  to comm:  7339
7638 7638
finding most similar node
moving node:  8  from comm:  8  to comm:  1033
7638 7638
finding most similar node
moving node:  9  from comm:  9  to comm:  4660
7638 7638
finding most similar node
moving node:  10  from

In [60]:
def _renumber_dict(P):
    comm_set = set(P.values())
    renumber_dict = {comm: index for index, comm in enumerate(comm_set)}
    return renumber_dict
    # P = {v: renumber_dict[comm] for v, comm in P.items()}
    # return P
    

def levels_to_partitions(G, levels, idx_dict):
    partitions = []
    # for v in G.vs:
    #     levels[v.index] = levels[v.index][0:-1]
    for v in G.nodes:
        levels[v] = levels[v][0:-1]
    for level in range(len(levels[0])):
        P = {}
        # for v in G.vs:
            # P[v['name']] = levels[v.index][level]
        for v in G.nodes:
            P[idx_dict[v]] = levels[v][level]
        renumber_dict = _renumber_dict(P)
        P = {v: renumber_dict[comm] for v, comm in P.items()}
        # for v in G.vs:
        #     levels[v.index][level] = P[v['name']]
        for v in G.nodes:
            levels[v][level] = P[idx_dict[v]]
        partitions.append(P)
    last_partition = partitions[-1]
    comm_labels = set(last_partition.values())
    if len(comm_labels) > 1:
        # partitions.append({v['name']: 0 for v in G.vs})
        partitions.append({idx_dict[v]: 0 for v in G.nodes})
        for v in G.vs:
            levels[v.index].append(0)
    return partitions, levels
# partitions, renumbered_levels = levels_to_partitions(G_ccs, copy.deepcopy(levels))
sum_partitions, sum_renumbered_levels = levels_to_partitions(G_sum, copy.deepcopy(sum_levels), sum_idx_to_id_dict)
full_partitions, full_renumbered_levels = levels_to_partitions(G_full, copy.deepcopy(full_levels), full_idx_to_id_dict)

In [63]:
def save_json(data, filepath=r'new_data.json'):
    with open(filepath, 'w') as fp:
        json.dump(data, fp, indent=4)

In [64]:
print(len(sum_partitions[1]))
save_json(sum_partitions, 'sum_partitions.json')
save_json(full_partitions, 'full_partitions.json')

7638


In [65]:
def add_dummy_partition(partitions):
    first_partition = partitions[0]
    dummy_partition = {}
    for index, node_id in enumerate(list(first_partition.keys())):
        dummy_partition[node_id] = index
    partitions.insert(0, dummy_partition)
    return partitions
# partitions = json.load(open('data/result/AllTheNews/network/server/ravasz_partitions_article.json', 'r'))
# partitions = json.load(open('data/result/AllTheNews/network/server/ravasz_partitions_entity.json', 'r'))
# partitions = json.load(open('data/result/VisPub/network/server/ravasz_partitions_article.json', 'r'))
sum_partitions = json.load(open('sum_partitions_raw.json', 'r'))
full_partitions = json.load(open('full_partitions_raw.json', 'r'))
sum_partitions = add_dummy_partition(sum_partitions)
full_partitions = add_dummy_partition(full_partitions)
# save_json(partitions, 'data/result/AllTheNews/network/server/ravasz_partitions_article.json')
# save_json(partitions, 'data/result/AllTheNews/network/server/ravasz_partitions_entity.json')
# save_json(partitions, 'data/result/VisPub/network/server/ravasz_partitions_article.json')
save_json(sum_partitions, 'sum_partitions.json')
save_json(full_partitions, 'full_partitions.json')

In [66]:
def get_level_transition(levels):
    nested_comms = {}
    for i in range(len(levels[0])-1):
        for v, transitions in levels.items():
            trans_children_title = "L-{}-{}".format(i, transitions[i])
            trans_parent_title = "L-{}-{}".format(i+1, transitions[i+1])
            # if children is the first level
            if trans_children_title not in nested_comms:
                # create leaf
                nested_comms[trans_children_title] = {
                    "title": trans_children_title,
                    "key": trans_children_title
                }
                # add to parent 
                if trans_parent_title not in nested_comms:
                    nested_comms[trans_parent_title] = {
                        "title": trans_parent_title,
                        "key": trans_parent_title,
                        'children': [nested_comms[trans_children_title]]
                    }
                # avoid adding duplicate children
                elif trans_children_title not in [child['title'] for child in nested_comms[trans_parent_title]['children']]:
                    nested_comms[trans_parent_title]['children'].append(nested_comms[trans_children_title])
            else:
                # if children is not the first level
                # add to parent directly
                if trans_parent_title not in nested_comms:
                    nested_comms[trans_parent_title] = {
                        "title": trans_parent_title,
                        "key": trans_parent_title,
                        'children': [nested_comms[trans_children_title]]
                    }
                # avoid adding duplicate children
                elif trans_children_title not in [child['title'] for child in nested_comms[trans_parent_title]['children']]:
                    nested_comms[trans_parent_title]['children'].append(nested_comms[trans_children_title])
    final_level = len(levels[0])-1
    return nested_comms['L-{}-{}'.format(final_level, 0)]
sum_hierarchies = get_level_transition(sum_renumbered_levels)
full_hierarchies = get_level_transition(full_renumbered_levels)
save_json(sum_hierarchies, 'sum_hierarchy_raw.json')
save_json(full_hierarchies, 'full_hierarchy_raw.json')

In [67]:
def dfs(hierarchy, leaf_children_dict):
    cur_level_label = hierarchy['title'].split("-")[1]
    cur_cluster_label = hierarchy['title'].split("-")[2]
    new_level_label = str(int(cur_level_label) + 1)
    hierarchy['title'] = "L-{}-{}".format(new_level_label, cur_cluster_label)
    hierarchy['key'] = "L-{}-{}".format(new_level_label, cur_cluster_label)
    if 'children' in hierarchy:
        for child in hierarchy['children']:
            dfs(child, leaf_children_dict)
    else:
        dummy_clusters = leaf_children_dict[cur_cluster_label]
        print(dummy_clusters, cur_cluster_label)
        hierarchy['children'] = []
        for dummy_cluster_label in dummy_clusters:
            hierarchy['children'].append({ 
                "title": "L-0-{}".format(dummy_cluster_label),
                "key": "L-0-{}".format(dummy_cluster_label),
            })
    return

def add_dummy_hierarchy(partitions, hierarchies):
    first_partition = partitions[0]
    second_partition = partitions[1]
    second_level_children_dict = defaultdict(list)
    for node_id, dummy_cluster_label in first_partition.items():
        parent_cluster_label = second_partition[node_id]
        second_level_children_dict[str(parent_cluster_label)].append(dummy_cluster_label)
    print(second_level_children_dict)
    dfs(hierarchies, second_level_children_dict)
    return hierarchies
# hierarchies = json.load(open("data/result/AllThenews/network/server/ravasz_hierarchies_entity.json"))
# hierarchies = json.load(open("data/result/VisPub/network/server/ravasz_hierarchies_article.json"))
# hierarchies = json.load(open("data/result/VisPub/network/server/ravasz_hierarchies_entity.json"))
sum_hierarchies = json.load(open('sum_hierarchy_raw.json', 'r'))
full_hierarchies = json.load(open('full_hierarchy_raw.json', 'r'))
new_sum_hierarchies = add_dummy_hierarchy(sum_partitions, sum_hierarchies)
new_full_hierarchies = add_dummy_hierarchy(full_partitions, full_hierarchies)
save_json(new_sum_hierarchies, 'sum_hierarchies.json')
save_json(new_full_hierarchies, 'full_hierarchies.json')

defaultdict(<class 'list'>, {'0': [0, 141, 2274, 5745], '602': [1, 1896], '1731': [2, 3626, 6085], '1761': [3, 594, 1118, 1335, 2030, 2758, 2807, 2842, 2891, 4424, 4844, 5030, 5414, 5624, 6144, 7279], '2478': [4, 286, 1509, 3899, 4775, 5521, 7436], '425': [5], '467': [6, 2848, 3736], '314': [7, 1127, 1864, 2054, 2143], '1354': [8, 3082, 4180, 4425, 4517, 4928, 5240, 6034], '1529': [9], '2032': [10, 3690, 5543], '2277': [11, 6484], '1996': [12, 1805, 2334, 4971, 6485, 6487, 6597, 7424], '433': [13, 1163, 2316, 2723, 2768], '1448': [14, 4661], '1144': [15, 4739, 5314, 6668, 7003], '786': [16], '1630': [17, 71, 179, 4941, 5537, 5619, 5706, 5855], '2460': [18, 1830, 5487, 7409], '339': [19, 21, 136, 557], '436': [20, 3815, 4822, 7199], '249': [22, 442, 1821, 1910, 2789], '1900': [23, 4677, 5323, 6444, 6658, 6679, 7286, 7631], '35': [24, 705, 829, 3729, 5813, 6026], '2218': [25, 1756, 5903, 6970, 7008], '603': [26, 574, 2092, 3262, 3482, 6121, 6931, 7552], '1838': [27, 225, 1214, 1231, 1924

In [None]:
import json
import copy
def flatten_hierarchy(hierarchy):
    queue = copy.deepcopy(hierarchy['children'])
    hierarchy_flattened = {}
    while(len(queue) > 0):
        cur = queue[0]
        hierarchy_flattened[cur['key']] = {
            "key": cur['key'],
            "title": cur['title'],
        }
        if 'children' in cur:
            queue += cur['children']
            children_keys = list(map(lambda child: child['key'], cur['children']))
            hierarchy_flattened[cur['key']]["children"] = children_keys
        queue = queue[1:]
    return hierarchy_flattened
entity_hierarchy = json.load(open('data/result/VisPub/network/server/ravasz_hierarchies_entity.json'))
hierarchy_flattened_entity = flatten_hierarchy(entity_hierarchy)

In [9]:
import json
from collections import defaultdict
from pprint import pprint
full_partitions = json.load(open('full/full_partitions.json'))
sum_partitions = json.load(open('summary/sum_partitions.json'))
def reverse_index(partition):
    comms = defaultdict(list)
    for v, comm in partition.items():
        comms[comm].append(v)
    return comms
full_comms = reverse_index(full_partitions[1])
sum_comms = reverse_index(sum_partitions[1])
pprint(full_comms)

defaultdict(<class 'list'>,
            {0: ['157882'],
             1: ['118313'],
             2: ['216564', '191646', '81520', '30136'],
             3: ['111466'],
             4: ['71638', '202233', '56205', '57880'],
             5: ['143880', '138818', '36904', '137891'],
             6: ['80206'],
             7: ['22535', '59295', '55690'],
             8: ['191072',
                 '191361',
                 '215903',
                 '157363',
                 '58427',
                 '173596',
                 '80981',
                 '137953',
                 '136753',
                 '72266',
                 '215862',
                 '216058',
                 '36482',
                 '49832',
                 '86349',
                 '122299',
                 '83700'],
             9: ['41349'],
             10: ['156552'],
             11: ['80093', '58261'],
             12: ['110810', '111094'],
             13: ['96940'],
             14: ['140306', '192351

In [7]:
pprint(sum_comms)

defaultdict(<class 'list'>,
            {0: ['25256', '25164', '72307', '169969'],
             1: ['21146', '24190'],
             2: ['25197',
                 '25914',
                 '21188',
                 '134477',
                 '144310',
                 '174266',
                 '175223',
                 '210042'],
             3: ['22113', '136346', '200891', '216096'],
             4: ['24968', '172922', '171345'],
             5: ['25751', '97332'],
             6: ['22838',
                 '17292',
                 '22453',
                 '121936',
                 '169009',
                 '170806',
                 '167166',
                 '194634'],
             7: ['22039', '20464'],
             8: ['20604'],
             9: ['26451', '135828'],
             10: ['35926', '39645'],
             11: ['23264'],
             12: ['36820', '43353', '45682'],
             13: ['20318', '42983', '48554'],
             14: ['23196', '36981', '56313'],
          