In [1]:
import glob
import os
import shutil
import igraph
import numpy as np
from collections import Counter
import spacy
import re
from tqdm import tqdm

In [2]:
project = "elasticsearch"
filepath = glob.glob(f"../../data/projects/arcanOutput/{project}/dep-graph-*.graphml")[0]

graph = igraph.Graph.Read_GraphML(filepath)
print(len(graph.es))
print(len(graph.vs))

delete = [x.index for x in graph.vs if "$" in x['name']]
graph.delete_vertices(delete)
graph.es.select(labelE='isChildOf').delete()
graph.es.select(labelE='isImplementationOf').delete()
graph.es.select(labelE='nestedTo').delete()
graph.es.select(labelE='belongsTo').delete()
graph.es.select(labelE='implementedBy').delete()
graph.es.select(labelE='definedBy').delete()

graph.vs.select(_degree=0).delete()
print(len(graph.es))
print(len(graph.vs))

54143
6774
30945
3901


In [3]:
communities_info = graph.community_infomap()

In [4]:
len(set(communities_info.membership))


249

In [8]:
import leidenalg

communities_leiden = leidenalg.find_partition(graph, leidenalg.ModularityVertexPartition)
print(communities_leiden.quality())

graph.vs['infomap'] = communities_info.membership
graph.vs['leiden'] = communities_leiden.membership

0.535410350341681


In [11]:
#graph.write_graphml(f"{project}_communities.graphml")

In [12]:
visual_style = {}
layout = graph.layout_fruchterman_reingold()
visual_style["vertex_size"] = 20
visual_style["vertex_label"] = graph.vs["name"]
graph.es["label"] = graph.es["labelE"]

visual_style["layout"] = layout
visual_style["bbox"] = (10000, 10000)
visual_style["margin"] = 20

In [13]:
def plot_graph(graph, method, visual_style):
    pal = igraph.drawing.colors.ClusterColoringPalette(len(set(graph.vs[method])))
    graph.vs['color'] = pal.get_many(graph.vs[method])
    print(os.path.join(*[method, project, "community.pdf"]))
    igraph.plot(graph, os.path.join(*[method, project, "community.pdf"]), **visual_style)

In [14]:
def folder(method, project):
    if not os.path.exists(method):
        os.makedirs(method)

    project_path = os.path.join(method, project)
    if not os.path.exists(project_path):
        #shutil.rmtree(project_path)

        os.makedirs(project_path)

In [16]:
method = "infomap"
folder(method, project)
plot_graph(graph, method, visual_style)

infomap/elasticsearch/community.pdf


In [17]:
method = "leiden"
folder(method, project)
plot_graph(graph, method, visual_style)#%%


print(set(graph.es["labelE"]))

leiden/elasticsearch/community.pdf
{'unitIsAfferentOf', 'dependsOn', 'containerIsAfferentOf'}


In [18]:
def extract_community_dependency(graph, method):
    counter = Counter()
    total = 0
    connected = set()
    for edge in graph.es:
        source_vertex = graph.vs[edge.source]
        target_vertex = graph.vs[edge.target]
        source_community = source_vertex[method]
        target_community = target_vertex[method]
        if source_community != target_community:
            total += 1
            counter[(source_community, target_community)] += 1
            connected.add(source_community)
            connected.add(target_community)

        # using get_eid() you can do the opposite:
        #same_edge_id = graph.get_eid(source_vertex_id, target_vertex_id)
        #same_edge = graph.es[same_edge_id]
        # by .index you get the id from the Vertex or Edge object:
        #source_vertex.index == source_vertex_id
        # True
        #edge.index == same_edge_id
        # True
    return counter, total, connected


In [20]:
method = "infomap"
counter, total = extract_community_dependency(graph, method)
n_count = [(x, y/total) for x, y in counter.most_common(len(set(graph.vs[method])))]
print(total)
print(counter.most_common(100))
print(n_count)

17742
[((32, 15), 522), ((22, 18), 307), ((32, 13), 222), ((30, 15), 201), ((22, 15), 199), ((22, 30), 195), ((20, 15), 190), ((32, 20), 177), ((32, 81), 169), ((53, 15), 166), ((32, 69), 160), ((22, 40), 155), ((32, 53), 141), ((49, 32), 139), ((69, 32), 139), ((18, 15), 134), ((20, 13), 134), ((18, 22), 132), ((49, 13), 130), ((15, 13), 122), ((49, 15), 119), ((34, 15), 112), ((53, 32), 111), ((77, 32), 105), ((69, 15), 100), ((49, 39), 100), ((32, 30), 97), ((30, 22), 95), ((32, 71), 94), ((39, 15), 91), ((49, 47), 88), ((77, 15), 86), ((24, 15), 86), ((20, 32), 83), ((32, 105), 83), ((3, 13), 83), ((34, 32), 81), ((32, 47), 78), ((30, 14), 77), ((69, 13), 67), ((30, 13), 66), ((53, 13), 66), ((49, 34), 65), ((30, 32), 65), ((52, 15), 63), ((18, 13), 62), ((24, 32), 61), ((49, 11), 61), ((30, 20), 61), ((30, 18), 61), ((9, 15), 57), ((49, 52), 57), ((14, 13), 56), ((13, 3), 56), ((0, 3), 55), ((22, 13), 54), ((32, 11), 53), ((32, 39), 53), ((17, 30), 52), ((10, 16), 51), ((65, 18), 

In [21]:
method = "leiden"
counter, total = extract_community_dependency(graph, method)
n_count = [(x, y/total) for x, y in counter.most_common(len(set(graph.vs[method])))]
print(total)
print(counter.most_common(100))
print(n_count)

9607
[((1, 4), 1352), ((3, 4), 665), ((3, 1), 634), ((1, 3), 571), ((4, 1), 557), ((2, 5), 445), ((5, 4), 437), ((2, 4), 384), ((0, 3), 247), ((5, 2), 211), ((6, 4), 210), ((6, 1), 203), ((3, 0), 202), ((5, 1), 186), ((6, 5), 155), ((0, 4), 153), ((4, 5), 142), ((1, 5), 133), ((9, 1), 115), ((3, 5), 109), ((0, 1), 106), ((4, 3), 103), ((9, 4), 99), ((1, 0), 92), ((7, 1), 88), ((2, 1), 84), ((6, 2), 82), ((1, 6), 80), ((3, 9), 80), ((1, 7), 78), ((5, 3), 77), ((1, 9), 74), ((3, 6), 72), ((5, 6), 71), ((6, 3), 71), ((2, 0), 69), ((4, 0), 68), ((1, 2), 61), ((7, 4), 60), ((0, 5), 59), ((5, 9), 58), ((5, 0), 56), ((4, 2), 52), ((10, 1), 50), ((10, 4), 45), ((2, 6), 43), ((0, 2), 41), ((2, 3), 41), ((4, 9), 40), ((9, 2), 35), ((5, 7), 35), ((0, 9), 34), ((11, 4), 32), ((6, 0), 28), ((9, 3), 24), ((0, 6), 23), ((4, 6), 22), ((9, 0), 22), ((10, 5), 19), ((7, 5), 17), ((4, 7), 14), ((10, 3), 14), ((3, 10), 14), ((1, 8), 13), ((11, 5), 12), ((1, 10), 11), ((2, 9), 10), ((6, 9), 9), ((3, 2), 9),

In [22]:
def get_intersection_matrix(comm_members_1, comm_members_2):
    clusters_number = len(set(comm_members_1.values()))
    ground_clusters_number = len(set(comm_members_2.values()))

    clusters_members = {}
    for pedestrian_id in comm_members_1:
        cluster_number = comm_members_1[pedestrian_id]
        clusters_members[cluster_number] = clusters_members[cluster_number] + [
            pedestrian_id] if cluster_number in clusters_members else [pedestrian_id]

    ground_clusters_members = {}
    for pedestrian_id in comm_members_2:
        cluster_number = comm_members_2[pedestrian_id]
        ground_clusters_members[cluster_number] = ground_clusters_members[cluster_number] + [
            pedestrian_id] if cluster_number in ground_clusters_members else [pedestrian_id]

    intersection_count = np.zeros(shape=(ground_clusters_number, clusters_number))

    for ground_key in ground_clusters_members:
        for cluster_key in clusters_members:
            a = ground_clusters_members[ground_key]
            b = clusters_members[cluster_key]
            intersection = list(set(a) & set(b))
            intersection_count[ground_key, cluster_key] = len(intersection)

    return intersection_count

def equivalence_classes(confusion_matrix):
    return [(i, v) for i, v in enumerate(np.argmax(confusion_matrix, axis=0))]

In [23]:
comm_info_dict = {k: v for k, v in enumerate(communities_info.membership)}
comm_leiden_dict = {k: v for k, v in enumerate(communities_leiden.membership)}


In [25]:
int_matrix = get_intersection_matrix(comm_info_dict, comm_leiden_dict)
equiv_classes = equivalence_classes(int_matrix)
print(equiv_classes)

[(0, 0), (1, 2), (2, 1), (3, 3), (4, 4), (5, 5), (6, 4), (7, 6), (8, 8), (9, 7), (10, 0), (11, 9), (12, 11), (13, 7), (14, 0)]


In [26]:
def get_subgraph(graph:igraph.Graph, method):
    for i in set(graph.vs[method]):
        ids = [x.index for x in graph.vs if x[method] == i]
        comm = graph.subgraph(ids)
        yield comm
        #comm.write_graphml(os.path.join(project_path, f"comm_{i}.graphml"))

subgraphs = list(get_subgraph(graph, "leiden"))

In [27]:
nlp = spacy.load("en_trf_bertbaseuncased_lg")
def split_camel(name):
    splitted = re.sub('([A-Z][a-z]+)|_', r' \1', re.sub('([A-Z]+)', r' \1', name)).split()
    return splitted

def name_to_sentence(name):
    tokens = name.split(".")[2:]
    clean = []
    for token in tokens:
        clean.extend(split_camel(token))

    return name, " ".join(clean).lower()


OSError: [E050] Can't find model 'en_trf_bertbaseuncased_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [64]:
# import scipy.sparse as sparse
#
# def get_coo_adj_list(subgraph):
#     source_vertices = []
#     target_vertices = []
#     for edge in subgraph.es:
#         source_vertex_id = edge.source
#         target_vertex_id = edge.target
#
#         source_vertices.append(source_vertex_id)
#         target_vertices.append(target_vertex_id)
#
#     coo = sparse.csr_matrix([source_vertices, target_vertices]).tocoo()
#     return coo

In [65]:
def get_embeddings(subgraph):
    for node in subgraph.vs:
        name = node['name']
        name, clean = name_to_sentence(name)
        # print(name, clean)
        if not clean:
            clean = node['name']
        embedding = nlp(clean).vector
        yield name, clean, embedding

In [66]:
project_path = os.path.join(method, project)

for i, subgraph in enumerate(subgraphs):
    subgraph.write_graphml(os.path.join(project_path, f"comm_{i}.graphml"))
    layout = subgraph.layout_fruchterman_reingold()
    visual_style["layout"] = layout
    igraph.plot(subgraph, os.path.join(project_path, f"comm_{i}.pdf"), **visual_style)
    # coo = get_coo_adj_list(subgraph)

    # sparse.save_npz(f"{project}_{method}_comm_{i}_coo.npz", coo)
    print(len(subgraph.gs), len(subgraph.es))
    embeddings = get_embeddings(subgraph)
    with open(os.path.join(project_path, f"comm_{i}.vec"), "wt", encoding="utf8") as outf:
        for name, cleanded, embedding in tqdm(embeddings):
            rep = " ".join([str(x) for x in embedding.tolist()])
            line = name +" " + rep + "\n"
            outf.write(line)
            # print(name, embedding, len(embedding))

798it [00:28, 27.95it/s]
644it [00:22, 28.30it/s]
614it [00:22, 27.24it/s]
519it [00:19, 27.07it/s]
437it [00:15, 28.22it/s]
284it [00:09, 28.59it/s]
184it [00:06, 29.09it/s]
153it [00:05, 29.10it/s]
110it [00:03, 28.91it/s]
104it [00:03, 28.52it/s]
54it [00:01, 29.84it/s]


In [67]:
import os
os.getcwd()

'/home/sasce/PycharmProjects/SemanticGraphEmbedding/semanticGraphEmbedding/community'

In [81]:
from sgnn.dataset import *
dataset = DependencyCommunityDataset("/home/sasce/PycharmProjects/SemanticGraphEmbedding/semanticGraphEmbedding/community")

tensor([[ 0,  0,  1,  ..., 94, 94, 94],
        [72, 81,  2,  ..., 31, 47, 50]])
tensor([[ 0,  1,  1,  2,  2,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  5,  5,
          5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  7,  7,  7,  7,  7,  7,
          7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, 10, 10, 10,
         10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
         11, 11, 12, 12, 12, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
         16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21,
         21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 23,
         23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
         24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
         24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27,
         27, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 30, 30, 30, 30,