In [1]:
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from clustering_utils import  build_cdlib_graph, build_ground_truth, get_ordered_communities, get_events_count
from utils import pre_processing_xes, discard_events_in_multiple_activities
import pm4py
import networkx as nx

In [2]:
# # LOADING DATA
# # event log not partitioned (single trace with a list of events)
# event_log = xes_importer.apply('BP-Meets-IoT2022/sim_22d1p/EventLogXES_fixed_wc.xes') # ('sim_22d1p_22/EventLogXESNoSegment.xes')
# # event log partitioned (list of named traces)
# event_log_labelled = xes_importer.apply('BP-Meets-IoT2022/sim_22d1p/EventLogXES_fixed_wc.xes') # ('sim_22d1p_22/EventLogXES.xes')

In [None]:
# LOADING DATA
# event log not partitioned (single trace with a list of events)
event_log = xes_importer.apply('BP-Meets-IoT2020/d21p1/EventLogXESNoSegment.xes') # ('sim_22d1p_22/EventLogXESNoSegment.xes')
# event log partitioned (list of named traces)
event_log_labelled = xes_importer.apply('BP-Meets-IoT2020/d21p1/EventLogXES.xes') # ('sim_22d1p_22/EventLogXES.xes')

In [4]:
# Pre-processing: removing noises
event_log, event_log_labelled = pre_processing_xes(event_log, event_log_labelled)

In [5]:
from utils import convert_to_dataframe, convert_to_event_log
from clustering_utils import get_events_count

In [8]:
# #events_count
# df_event_log = convert_to_dataframe(event_log)
# df_event_log = df_event_log.sort_values(by=["time:timestamp", "eventId"])
# event_log = convert_to_event_log(df_event_log)
# 
# df_event_log_labelled = convert_to_dataframe(event_log_labelled)
# df_event_log_labelled = df_event_log_labelled.sort_values(by=["time:timestamp", "eventId"])
# event_log_labelled = convert_to_event_log(df_event_log_labelled)

In [9]:
# Build DFG with PM4PY
dfg_graph = dfg_discovery.apply(event_log)

In [10]:
G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

# Ground truth
The ground truth is represented by the communities present in the log file labeled with the names of the activities.


In [13]:
ground_truth, ground_truth_dictionary =  build_ground_truth(event_log_labelled, nameIndex, G)


In [14]:
def dump_comm_to_names(new_dict):
    print("*** Communities to names")
    for item in new_dict:
         print(f"Community [{item['id']}]\t{item['len']}\n\t{item['evt']}\t")
    print("*******\n")

In [15]:
def comm_to_names(communities, nodes, sort_events=False):
    new_dict = get_ordered_communities(communities, nodes, sort_events)
    dump_comm_to_names(new_dict)

Dump the ground truth of events names

In [16]:
comms_truth =[]
idx = 0
for community_label in ground_truth_dictionary:
    #print(f"{idx} - {community_label}")
    comms_truth.append(list(ground_truth_dictionary[community_label]))
    idx = idx + 1
    
#comm_to_names(comms_truth, nodes, sort_events=True)

In [17]:
comm_to_names(comms_truth, nodes, sort_events=False)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 66]
[9, 10, 11, 12, 13]
[14, 15, 16, 17, 24]
[0, 8, 18, 19, 20, 21, 22, 23]
[25, 26]
[4, 27, 28, 29, 9, 30, 31, 32, 11, 33]
[34, 35, 36, 37, 41]
[38, 39, 40, 4, 27]
[42, 43, 44]
[45, 46]
[42, 47, 48]
[28, 49, 9, 50, 51, 52, 31, 53, 54, 55, 11, 33]
[0, 8, 18, 19, 56, 57, 22, 9, 58]
[11, 59]
[0, 8, 18, 19, 56, 60, 22, 23]
[61, 62]
[4, 27, 28, 63, 64, 65, 31, 53, 11, 33]
*** Communities to names
Community [12]	12
	go_fridge	get_ingredients_from_fridge	go_kitchen_shelf	get_ingredients_from_shelf	go_oven	use_oven	go_dining_table	eat_warm_meal	pack_food	put_meal_to_fridge	go_kitchen_sink	put_plate_to_sink	
Community [1]	10
	go_wardrobe	get_clothes	go_bathtub	have_bath	go_bathroom_sink	brush_teeth	go_bed	sleep_in_bed	change_clothes	go_to start	
Community [6]	10
	go_bathroom_sink	wash_hands	go_fridge	get_food_from_fridge	go_kitchen_shelf	get_bread	go_dining_table	eat_cold_meal	go_kitchen_sink	put_plate_to_sink	
Community [17]	10
	go_bathroom_sink	wash_hands	go_fridg

# Algorithms evaluation
Often it make sense to execute a given CD algorithm multiple times, varying its parameters, so to identify the optimal configuration w.r.t. a given fitness score.
We now experiment several algorithms: louvain, leiden, infomap, ...

In [18]:
from cdlib import algorithms
from cdlib import evaluation
from cdlib import ensemble

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool'}


# Calculate stats comparing ground truth communities and the discovered ones

In [19]:
import copy
from statistics import mean

In [20]:
def coverage_activities(executed_alg, groud_truth, nodes, communities=None, label=None):
    communities_p1 = executed_alg.communities
    communities_p2 = communities
    if groud_truth is not None:
        communities_p2 = groud_truth.communities
        
    if len(communities_p1) == 0:
        return None
        
    dict_p1 = get_ordered_communities(communities_p1, nodes, sort_events=True)
    dict_p2 = get_ordered_communities(communities_p2, nodes, sort_events=True)
    
    disc_stat = {}
    # compute num communities over expected ones
    disc_stat["partitions"] = len(dict_p1)
    disc_stat["partitions_truth"] = len(dict_p2)
    disc_stat["covered_partitions"] = len(dict_p1)/len(dict_p2)
    # if disc_stat["coverage_p1"] < 1:
    #     disc_stat["coverage_p1_overall_balance"] = "fewer"
    # elif disc_stat["coverage_p1"] > 1:
    #     disc_stat["coverage_p1_overall_balance"] = "higher"
    # else:
    #     disc_stat["coverage_p1_overall_balance"] = "match"
    disc_stat["stat_covered_partitions"] = \
        "fewer" if disc_stat["covered_partitions"] < 1 else \
        "higher" if disc_stat["covered_partitions"] > 1 else \
        "match"
    # Now search among all the discovered communities their coverage w.r.t. ground truth communities' events
    disc_stat["communities"] = []
    '''
    item["len"] = len(vals)
    item["vals"] = vals    
    item["evt"] = vals_str
    item["id"] = idx
    item["vals_hash"] = vals_hash
    '''
    for community in dict_p1:
        events = community["vals"]
    
        #coverage_cluster = []
        better_distance = 0
        better_coverage_cluster = 0
        coverage_community_idx = -1
        for community_truth in dict_p2:
            events_truth = community_truth["vals"]
            common_elements = list(set(events).intersection(set(events_truth)))
            all_elements = list(set(events).union(set(events_truth)))
            #coverage_cluster.append(len(common_elements)/len(events_truth))
            coverage_cluster = len(common_elements)/len(events_truth)
            distance = 2 * (len(common_elements)/len(all_elements))
            if better_coverage_cluster < coverage_cluster:
                better_coverage_cluster = coverage_cluster
                coverage_community_idx = community_truth["id"]
            if better_distance < distance:
                better_distance = distance
    
        d2 = copy.deepcopy(community)
        d2["max_coverage_cluster"] = better_coverage_cluster  
        d2["max_coverage_cluster_id"] = coverage_community_idx  
        d2["distance"] = better_distance  
        disc_stat["communities"].append(d2)
    
    # calculate the average of the max_coverage_cluster
    coverages_cluster = []
    all_distance = []
    for community in  disc_stat["communities"]:
        coverages_cluster.append(community["max_coverage_cluster"])
        all_distance.append(community["distance"])
    disc_stat["mean_coverage_partitions"] = mean(coverages_cluster)
    disc_stat["min_coverage_partitions"] = min(coverages_cluster)
    disc_stat["max_coverage_partitions"] = max(coverages_cluster)
    disc_stat["mean_distance"] = mean(all_distance)
    disc_stat["min_distance"] = min(all_distance)
    disc_stat["max_distance"] = max(all_distance)

    disc_stat["method_parameters"] = executed_alg.method_parameters
    
    if label is not None:
        disc_stat["label"] = label

    if groud_truth is not None:
        f1 = evaluation.f1(executed_alg, groud_truth)
        disc_stat["f1_score"] = f1.score
        disc_stat["f1_std"] = f1.std
    else:
        disc_stat["f1_score"] = None
        disc_stat["f1_std"] = None
   
    return disc_stat

In [22]:
#coms_congo = algorithms.congo(G, number_communities=14, height=2);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#coms_conga = algorithms.congo(G, number_communities=14);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
coms_dcs = algorithms.dcs(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
coms_ebgc = algorithms.ebgc(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
coms_graph_entropy = algorithms.graph_entropy(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
com_lais2 = algorithms.lais2(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
coms_multicom = algorithms.multicom(G, seed_node=0);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
coms_umstmo = algorithms.umstmo(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
coms_percomvc = algorithms.percomvc(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

In [23]:
algs ={}

In [24]:
# algs['coms_aslpaw']=coms_aslpaw
# algs['coms_angel']=coms_angel
# algs['coms_coach']=coms_coach
#algs['coms_congo']=coms_conga
#algs['coms_conga']=coms_conga
# algs['coms_core_expansion']=coms_core_expansion
algs['coms_dcs']=coms_dcs
# algs['coms_demon']=coms_demon
# algs['coms_dpclus']=coms_dpclus
algs['coms_ebgc']=coms_ebgc
# algs['coms_ego_networks']=coms_ego_networks
# algs['coms_endntm']=coms_endntm
# algs['com_kclique']=com_kclique
algs['coms_graph_entropy']=coms_graph_entropy
#algs['coms_ipca']=coms_ipca
algs['com_lais2']=com_lais2
#algs['coms_lpam']=coms_lpam
#algs['coms_lpanni']=coms_lpanni
#algs['com_lfm']=com_lfm
algs['coms_multicom']=coms_multicom
# algs['coms_node_perception']=coms_node_perception
# algs['com_overlapping_seed_set_expansion']=com_overlapping_seed_set_expansion
algs['coms_umstmo']=coms_umstmo
algs['coms_percomvc']=coms_percomvc
# algs['coms_slpa']=coms_slpa
# algs['coms_wCommunity']=coms_wCommunity

In [25]:
list_algs = []

In [26]:
#algorithms.angel
item = {
    "alg" : algorithms.angel, "params" : [
        ["threshold", 0.1, 1, 0.1]
    ]
}; list_algs.append(item)

#algorithms.coach
item = {
    "alg" : algorithms.coach, "params" : [
        ["density_threshold", 0.1, 1, 0.1],
        ["affinity_threshold", 0.1, 1, 0.1],
        ["closeness_threshold", 0.1, 1, 0.1]
    ]
}; list_algs.append(item)

#algorithms.congo
# item = {
#     "alg" : algorithms.congo, "params" : [
#         ["height", 1, 10, 1]
#     ]
# }; list_algs.append(item)

#algorithms.core_expansion
item = {
    "alg" : algorithms.core_expansion, "params" : [
        ["tolerance", 0.0001, 0.0001, 0.001]
    ]
}; list_algs.append(item)

#algorithms.demon
item = {
    "alg" : algorithms.demon, "params" : [
        ["epsilon", 0.1, 1, 0.1]
    ]
}; list_algs.append(item)

#algorithms.dpclus
item = {
    "alg" : algorithms.dpclus, "params" : [
        ["d_threshold", 0.1, 2, 0.1],
        ["cp_threshold", 0.1, 2, 0.1]
    ]
}; list_algs.append(item)



In [27]:

#algorithms.ego_networks
item = {
    "alg" : algorithms.ego_networks, "params" : [
        ["level", 1, 17, 1]
    ]
}; list_algs.append(item)

#algorithms.endntm
item = {
    "alg" : algorithms.endntm, "params" : [
        ["epsilon", 1, 17, 1]
    ]
};# list_algs.append(item)

#algorithms.kclique
item = {
    "alg" : algorithms.kclique, "params" : [
        ["k", 3, 17, 1]
    ]
}; list_algs.append(item)

#algorithms.ipca
item = {
    "alg" : algorithms.ipca, "params" : [
        ["t_in", 0.1, 1, 0.1]
    ]
}; list_algs.append(item)

#algorithms.lemon
item = {
    "alg" : algorithms.lemon, "params" : [
        ["expand_step", 1, 17, 1],
        ["subspace_dim", 1, 17, 1],
        ["walk_steps", 1, 17, 1]
    ]
};# list_algs.append(item)

#algorithms.lpam
item = {
    "alg" : algorithms.lpam, "params" : [
        ["k", 1, 1, 17],
        ["subspace_dim", 1, 17, 1],
        ["threshold", 0.1, 1, 0.1]
    ]
}; list_algs.append(item)

#algorithms.lpanni
item = {
    "alg" : algorithms.lpanni, "params" : [
        ["threshold", 0.1, 1, 0.1]
    ]
}; list_algs.append(item)

#algorithms.lfm
item = {
    "alg" : algorithms.lfm, "params" : [
        ["alpha", 0.1, 1, 0.1]
    ]
}; list_algs.append(item)


In [28]:
#list_algs = []


In [29]:
#algorithms.node_perception
item = {
    "alg" : algorithms.node_perception, "params" : [
        ["threshold", 0.1, 1, 0.1],
        ["overlap_threshold", 0.1, 1, 0.1]
    ]
}; list_algs.append(item)

#algorithms.overlapping_seed_set_expansion
item = {
    "alg" : algorithms.overlapping_seed_set_expansion, "params" : [
        ["nruns", 13, 20, 1],
        ["alpha", 0.1, 2, 0.1],
        ["delta", 0.1, 1, 0.1]
    ]
};# list_algs.append(item)

# #algorithms.slpa
# item = {
#     "alg" : algorithms.slpa, "params" : [
#         ["t", 21, 30, 1],
#         ["r", 0.1, 1, 0.1]
#     ]
# }; list_algs.append(item)

#algorithms.walkscan
# item = {
#     "alg" : algorithms.walkscan, "params" : [
#         ["nb_steps", 2, 17, 1],
#         ["eps", 0.1, 1, 0.1],
#         ["min_samples", 3, 17, 1]
#     ]
# }; list_algs.append(item)

#algorithms.wCommunity
item = {
    "alg" : algorithms.wCommunity, "params" : [
        ["min_bel_degree", 0.1, 0.9, 0.1],
        ["threshold_bel_degree", 0.1, 0.9, 0.1]
    ]
}; list_algs.append(item)

In [30]:
for item in list_algs:
    G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

    resolution = ensemble.Parameter(name="resolution", start=0.5, end=17, step=0.1)
    parameters = []
    for p in item["params"]:
        parameters.append(ensemble.Parameter(name=p[0], start=p[1], end=p[2], step=p[3]))

    idx = 0
    for coms in ensemble.grid_execution(graph=G, method=item["alg"], parameters=parameters):
        algorithm_key = coms.method_name + "_" + str(idx)
        idx = idx+1
        algs[algorithm_key] = coms
        print("added " + algorithm_key, coms.method_parameters)

added ANGEL_0 {'threshold': 0.1, 'min_community_size': 3}
added ANGEL_1 {'threshold': 0.2, 'min_community_size': 3}
added ANGEL_2 {'threshold': 0.30000000000000004, 'min_community_size': 3}
added ANGEL_3 {'threshold': 0.4, 'min_community_size': 3}
added ANGEL_4 {'threshold': 0.5, 'min_community_size': 3}
added ANGEL_5 {'threshold': 0.6, 'min_community_size': 3}
added ANGEL_6 {'threshold': 0.7000000000000001, 'min_community_size': 3}
added ANGEL_7 {'threshold': 0.8, 'min_community_size': 3}
added ANGEL_8 {'threshold': 0.9, 'min_community_size': 3}
added coach_0 {'density_threshold': 0.1, 'affinity_threshold': 0.1, 'closeness_threshold': 0.1}
added coach_1 {'density_threshold': 0.1, 'affinity_threshold': 0.1, 'closeness_threshold': 0.2}
added coach_2 {'density_threshold': 0.1, 'affinity_threshold': 0.1, 'closeness_threshold': 0.30000000000000004}
added coach_3 {'density_threshold': 0.1, 'affinity_threshold': 0.1, 'closeness_threshold': 0.4}
added coach_4 {'density_threshold': 0.1, 'affin

ValueError: Sample larger than population or is negative

In [31]:
# id_name, cover_partitions, stat coverage, mean, min, max
for alg, comms_alg in algs.items():
    alg_coverage = coverage_activities(comms_alg, ground_truth, nodes, label="overlapping")
    #print(f"{alg}\t{alg_coverage['covered_partitions']}\t{alg_coverage['stat_covered_partitions']}\t{alg_coverage['mean_coverage_partitions']}\t{alg_coverage['min_coverage_partitions']}\t{alg_coverage['max_coverage_partitions']}")

[0, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 58, 59, 61, 62, 63, 65]
[18, 19, 20, 21, 56, 57, 60]
frozenset({33, 9, 11, 49, 50, 51, 52, 53, 54, 55, 28, 31})
frozenset({64, 65, 33, 4, 11, 31, 53, 27, 28, 63})
frozenset({32, 33, 4, 9, 11, 27, 28, 29, 30, 31})
frozenset({0, 1, 2, 3, 4, 5, 6, 7, 8, 66})
frozenset({0, 8, 9, 18, 19, 22, 56, 57, 58})
frozenset({0, 8, 18, 19, 20, 21, 22, 23})
frozenset({0, 8, 18, 19, 22, 23, 56, 60})
frozenset({16, 17, 24, 14, 15})
frozenset({4, 38, 39, 40, 27})
frozenset({9, 10, 11, 12, 13})
frozenset({34, 35, 36, 37, 41})
frozenset({48, 42, 47})
frozenset({42, 43, 44})
frozenset({25, 26})
frozenset({59, 11})
frozenset({61, 62})
frozenset({45, 46})
[8, 9, 10, 17, 23, 24, 26, 27, 29, 30, 32, 33, 37, 41, 46, 49, 50, 54, 58, 62]
[6, 11, 13, 14, 16, 25, 31, 34, 36, 38, 40]
[18, 19, 20, 21, 56, 57, 60]
[0]
[1]
[2]
[3]
[4]
[5]

In [32]:
# list_algs = []
# #           name, start, stop, step 
# item = {
#     "alg" : "name",
#     "params" : [
#         ["name", 1, 1, 1]
#     ]
# }
# list_algs.append(item)


In [33]:
import pandas as pd

In [34]:
def get_stats(algs, label=None):
    structure_algs = []
    for alg, comms_alg in algs.items():
        alg_coverage = coverage_activities(comms_alg, ground_truth, nodes, label=label)
        if alg_coverage is None:
            continue

        my_item = {}
        my_item["alg"] = alg
        my_item["partitions"] = alg_coverage['partitions']
        my_item["truth"] = alg_coverage['partitions_truth']
        my_item["covered"] = alg_coverage['covered_partitions']
        my_item["matching"] = alg_coverage['stat_covered_partitions']
        my_item["mean_coverage"] = alg_coverage['mean_coverage_partitions']
        my_item["min_coverage"] = alg_coverage['min_coverage_partitions']
        my_item["max_coverage"] = alg_coverage['max_coverage_partitions']
        
        my_item["mean_distance"] = alg_coverage['mean_distance']
        my_item["min_distance"] = alg_coverage['min_distance']
        my_item["max_distance"] = alg_coverage['max_distance']
        
        
        my_item["f1_score"] = alg_coverage['f1_score']
        my_item["f1_std"] = alg_coverage['f1_std']
        
        if label is not None:
            my_item["label"] = label
        
        for param, value in alg_coverage['method_parameters'].items():
            my_item[param] = value

        structure_algs.append(my_item)
    return structure_algs

In [35]:
structure_algs = get_stats(algs, label="overlapping")

[0, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 58, 59, 61, 62, 63, 65]
[18, 19, 20, 21, 56, 57, 60]
frozenset({33, 9, 11, 49, 50, 51, 52, 53, 54, 55, 28, 31})
frozenset({64, 65, 33, 4, 11, 31, 53, 27, 28, 63})
frozenset({32, 33, 4, 9, 11, 27, 28, 29, 30, 31})
frozenset({0, 1, 2, 3, 4, 5, 6, 7, 8, 66})
frozenset({0, 8, 9, 18, 19, 22, 56, 57, 58})
frozenset({0, 8, 18, 19, 20, 21, 22, 23})
frozenset({0, 8, 18, 19, 22, 23, 56, 60})
frozenset({16, 17, 24, 14, 15})
frozenset({4, 38, 39, 40, 27})
frozenset({9, 10, 11, 12, 13})
frozenset({34, 35, 36, 37, 41})
frozenset({48, 42, 47})
frozenset({42, 43, 44})
frozenset({25, 26})
frozenset({59, 11})
frozenset({61, 62})
frozenset({45, 46})
[8, 9, 10, 17, 23, 24, 26, 27, 29, 30, 32, 33, 37, 41, 46, 49, 50, 54, 58, 62]
[6, 11, 13, 14, 16, 25, 31, 34, 36, 38, 40]
[18, 19, 20, 21, 56, 57, 60]
[0]
[1]
[2]
[3]
[4]
[5]

In [36]:
dataframe = pd.DataFrame(structure_algs) 

# Crisp (non overlapping) algorithms

In [37]:
G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

In [38]:
#com_agdl = algorithms.agdl(G, number_communities=3, kc=4);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#com_bayan = algorithms.bayan(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#coms_cpm = algorithms.cpm(G, resolution_parameter=1);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#coms_der = algorithms.der(G, 3, .00001, 50)
com_eigenvector = algorithms.eigenvector(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
com_em = algorithms.em(G, k=17);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#com_ga = algorithms.ga(G, population=2, generation=50);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#com_gdmp2 = algorithms.gdmp2(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#com_girvan_newman = algorithms.girvan_newman(G, level=3);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
coms_greedy_modularity = algorithms.greedy_modularity(G, weight="weight");G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#coms_head_tail = algorithms.head_tail(G, head_tail_ratio=0.8);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
coms_infomap = algorithms.infomap(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
coms_leiden = algorithms.leiden(G, weights="weight");G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#coms_louvain = algorithms.louvain(G, weight='weight', resolution=3);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#coms_lswl_plus = algorithms.lswl_plus(G, detect_overlap=False)
#coms_mcode = algorithms.mcode(G, weights="weight");G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
coms_paris = algorithms.paris(G);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#coms_pycombo = algorithms.pycombo(G, weight="weight", max_communities=14);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
#coms_rber_pots = algorithms.rb_pots(G, weights="weight")
#ßcoms_surprise_communities = algorithms.surprise_communities(G, weights="weight")
#ßcoms_threshold_clustering = algorithms.threshold_clustering(G)

                       

In [39]:
algs_crisp= {}

## Louvain test

In [40]:
from cdlib import ensemble

In [41]:
G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

resolution = ensemble.Parameter(name="resolution", start=0.5, end=17, step=0.1)

idx = 0
for coms in ensemble.grid_execution(graph=G, method=algorithms.louvain, parameters=[resolution]):
    algorithm_key = coms.method_name + "_" + str(idx)
    idx = idx+1
    algs_crisp[algorithm_key] = coms

In [42]:
G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

resolution_parameter = ensemble.Parameter(name="resolution_parameter", start=0.1, end=17, step=0.1)

idx = 0
for coms in ensemble.grid_execution(graph=G, method=algorithms.cpm, parameters=[resolution_parameter]):
    #print(coms.method_name, coms.method_parameters, "\n", coms.communities, "\n")
    #print(coms.method_parameters["resolution"], "\t", evaluation.f1(coms, ground_truth).score)
    algorithm_key = coms.method_name + "_" + str(idx)
    idx = idx+1
    algs_crisp[algorithm_key] = coms


In [43]:
G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

population = ensemble.Parameter(name="population", start=300, end=400, step=50)
generation = ensemble.Parameter(name="generation", start=30, end=40, step=5)

idx = 0
for coms in ensemble.grid_execution(graph=G, method=algorithms.ga, parameters=[population, generation]):
    print(coms.method_name, coms.method_parameters)#, "\n", coms.communities, "\n")
    #print(coms.method_parameters["resolution"], "\t", evaluation.f1(coms, ground_truth).score)
    algorithm_key = coms.method_name + "_" + str(idx)
    idx = idx+1
    algs_crisp[algorithm_key] = coms


ga {'population': 300, 'generation': 30, 'r': 1.5}
ga {'population': 300, 'generation': 35, 'r': 1.5}
ga {'population': 350, 'generation': 30, 'r': 1.5}
ga {'population': 350, 'generation': 35, 'r': 1.5}


In [44]:
# G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
# 
# min_threshold = ensemble.Parameter(name="min_threshold", start=0.5, end=2, step=0.1)
# 
# idx = 0
# for coms in ensemble.grid_execution(graph=G, method=algorithms.gdmp2, parameters=[min_threshold]):
#     #print(coms.method_name, coms.method_parameters, "\n", coms.communities, "\n")
#     #print(coms.method_parameters["resolution"], "\t", evaluation.f1(coms, ground_truth).score)
#     algorithm_key = coms.method_name + "_" + str(idx)
#     idx = idx+1
#     algs_crisp[algorithm_key] = coms

In [45]:
G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

level = ensemble.Parameter(name="level", start=1, end=17, step=1)

idx = 0
for coms in ensemble.grid_execution(graph=G, method=algorithms.girvan_newman, parameters=[level]):
    #print(coms.method_name, coms.method_parameters, "\n", coms.communities, "\n")
    #print(coms.method_parameters["resolution"], "\t", evaluation.f1(coms, ground_truth).score)
    algorithm_key = coms.method_name + "_" + str(idx)
    idx = idx+1
    algs_crisp[algorithm_key] = coms

In [46]:
G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

head_tail_ratio = ensemble.Parameter(name="head_tail_ratio", start=0.1, end=1, step=0.1)

idx = 0
for coms in ensemble.grid_execution(graph=G, method=algorithms.head_tail, parameters=[head_tail_ratio]):
    #print(coms.method_name, coms.method_parameters, "\n", coms.communities, "\n")
    #print(coms.method_parameters["resolution"], "\t", evaluation.f1(coms, ground_truth).score)
    algorithm_key = coms.method_name + "_" + str(idx)
    idx = idx+1
    algs_crisp[algorithm_key] = coms

In [47]:
G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

weight_threshold = ensemble.Parameter(name="weight_threshold", start=0.1, end=1, step=0.1)

idx = 0
for coms in ensemble.grid_execution(graph=G, method=algorithms.mcode, parameters=[weight_threshold]):
    #print(coms.method_name, coms.method_parameters, "\n", coms.communities, "\n")
    #print(coms.method_parameters["resolution"], "\t", evaluation.f1(coms, ground_truth).score)
    algorithm_key = coms.method_name + "_" + str(idx)
    idx = idx+1
    algs_crisp[algorithm_key] = coms

In [48]:
#coms_pycombo = algorithms.pycombo(G, weight="weight", max_communities=14);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

modularity_resolution = ensemble.Parameter(name="modularity_resolution", start=0.5, end=5, step=0.1)

idx = 0
for coms in ensemble.grid_execution(graph=G, method=algorithms.pycombo, parameters=[modularity_resolution]):
    #print(coms.method_name, coms.method_parameters, "\n", coms.communities, "\n")
    #print(coms.method_parameters["resolution"], "\t", evaluation.f1(coms, ground_truth).score)
    algorithm_key = coms.method_name + "_" + str(idx)
    idx = idx+1
    algs_crisp[algorithm_key] = coms

In [49]:
# #com_agdl = algorithms.agdl(G, number_communities=3, kc=4);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
# ##coms_pycombo = algorithms.pycombo(G, weight="weight", max_communities=14);G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
# G, nodes, nameIndex = build_cdlib_graph(dfg_graph)
# 
# number_communities = ensemble.Parameter(name="number_communities", start=3, end=17, step=1)
# kc = ensemble.Parameter(name="kc", start=2, end=9, step=1)
# 
# idx = 0
# for coms in ensemble.grid_execution(graph=G, method=algorithms.agdl, parameters=[number_communities, kc]):
#     #print(coms.method_name, coms.method_parameters, "\n", coms.communities, "\n")
#     #print(coms.method_parameters["resolution"], "\t", evaluation.f1(coms, ground_truth).score)
#     algorithm_key = coms.method_name + "_" + str(idx)
#     idx = idx+1
#     algs_crisp[algorithm_key] = coms

In [50]:
#algs_crisp['com_agdl']=com_agdl
#algs_crisp['coms_cpm']=coms_cpm
algs_crisp['com_eigenvector']=com_eigenvector
algs_crisp['com_em']=com_em
#algs_crisp['com_ga']=com_ga
#algs_crisp['com_gdmp2']=com_gdmp2
#algs_crisp['com_girvan_newman']=com_girvan_newman
algs_crisp['coms_greedy_modularity']=coms_greedy_modularity
#algs_crisp['coms_head_tail']=coms_head_tail
algs_crisp['coms_infomap']=coms_infomap
algs_crisp['coms_leiden']=coms_leiden
#algs_crisp['coms_louvain']=coms_louvain
#algs_crisp['coms_mcode']=coms_mcode
algs_crisp['coms_paris']=coms_paris
#algs_crisp['coms_pycombo']=coms_pycombo

In [51]:
structure_algs_crips = get_stats(algs_crisp, label="crisp")

[4, 24, 38, 39, 40]
[11, 12, 13, 33, 59]
[27, 28, 29, 49, 55]
[34, 35, 36, 37, 41]
[5, 6, 7, 66]
[30, 31, 32, 53]
[50, 51, 52, 54]
[0, 8, 23]
[1, 2, 3]
[9, 10, 58]
[18, 19, 22]
[42, 43, 44]
[56, 57, 60]
[63, 64, 65]
[14, 15]
[16, 17]
[20, 21]
[25, 26]
[45, 46]
[47, 48]
[61, 62]
frozenset({33, 9, 11, 49, 50, 51, 52, 53, 54, 55, 28, 31})
frozenset({64, 65, 33, 4, 11, 31, 53, 27, 28, 63})
frozenset({32, 33, 4, 9, 11, 27, 28, 29, 30, 31})
frozenset({0, 1, 2, 3, 4, 5, 6, 7, 8, 66})
frozenset({0, 8, 9, 18, 19, 22, 56, 57, 58})
frozenset({0, 8, 18, 19, 20, 21, 22, 23})
frozenset({0, 8, 18, 19, 22, 23, 56, 60})
frozenset({16, 17, 24, 14, 15})
frozenset({4, 38, 39, 40, 27})
frozenset({9, 10, 11, 12, 13})
frozenset({34, 35, 36, 37, 41})
frozenset({48, 42, 47})
frozenset({42, 43, 44})
frozenset({25, 26})
frozenset({59, 11})
frozenset({61, 62})
frozenset({45, 46})
[4, 24, 38, 39, 40]
[11, 12, 13, 33, 59]
[27, 28, 29, 49, 55]
[34, 35, 36, 37, 41]
[5, 6, 7, 66]
[30, 31, 32, 53]
[50, 51, 52, 54]
[0, 

In [52]:
dataframe2 = pd.DataFrame(structure_algs_crips) 

In [53]:
dataframe_all = pd.concat([dataframe, dataframe2])

In [54]:
dataframe_all.sort_values(["f1_score", "f1_std", "mean_coverage", "partitions"],  ascending =[False, False, False, False], inplace=True)

In [55]:
dataframe_all.to_csv('abstraction/algorithms_stats.csv')

In [56]:
dataframe_all.dtypes

alg                      object
partitions                int64
truth                     int64
covered                 float64
matching                 object
mean_coverage           float64
min_coverage            float64
max_coverage            float64
mean_distance           float64
min_distance            float64
max_distance            float64
f1_score                float64
f1_std                  float64
label                    object
                         object
seeds                   float64
threshold               float64
min_community_size      float64
density_threshold       float64
affinity_threshold      float64
closeness_threshold     float64
epsilon                 float64
min_com_size            float64
d_threshold             float64
cp_threshold            float64
level                   float64
k                       float64
t_in                    float64
alpha                   float64
overlap_threshold       float64
min_bel_degree          float64
threshol

In [57]:
dataframe_all.to_excel("abstraction/algorithms_stats.xlsx")


# Saving communities

Selected the algorithm that perform better, save the communities to a dedicated file to be used with the CEP engine

In [58]:
from conversion import write_communities

In [59]:
selected_alg = algs_crisp['coms_infomap']

In [60]:
write_communities(selected_alg, nodes, filename="abstraction/communities_coms_infomap.txt", events_count=get_events_count(event_log))

Community n.  0
Set community label to go_wardrobe with higher counting of 82
Set community label to change_clothes with higher counting of 112
Community n.  1
Set community label to go_kitchen_shelf with higher counting of 174
Set community label to get_glass with higher counting of 236
Set community label to get_water with higher counting of 240
Community n.  2
Set community label to go_bathroom_sink with higher counting of 133
Set community label to wc_do with higher counting of 134
Community n.  3
Set community label to wash_hands with higher counting of 192
Community n.  4
Set community label to go_tv with higher counting of 59
Community n.  5
Set community label to go_windows with higher counting of 6
Community n.  6
Set community label to brush_teeth with higher counting of 30
Set community label to go_bed with higher counting of 47
Community n.  7
Set community label to go_computer with higher counting of 86
Community n.  8
Set community label to get_bread with higher counting 

{0: ['go_wardrobe',
  'change_clothes',
  'go_shoe_shelf',
  'dress_up_outdoor',
  'go_workplace',
  'work',
  'dress_down_outdoor',
  'finish_walk',
  'go_outside',
  'get_food',
  'walk_outside'],
 1: ['go_kitchen_shelf',
  'get_glass',
  'go_kitchen_sink',
  'get_water',
  'drink_water',
  'put_plate_to_sink',
  'pack_goods',
  'wash_dishes'],
 2: ['go_bathroom_sink', 'switch_computer_off', 'go_wc', 'wc_do', 'wc_flush'],
 3: ['wash_hands',
  'go_fridge',
  'get_food_from_fridge',
  'get_ingredients_from_fridge',
  'put_meal_to_fridge'],
 4: ['go_tv', 'switch_tv_on', 'go_tv_chair', 'do_watch_tv', 'switch_tv_off'],
 5: ['go_windows',
  'raise_blinds',
  'open_windows',
  'close_windows',
  'lower_blinds'],
 6: ['brush_teeth', 'go_bed', 'sleep_in_bed', 'go_to start'],
 7: ['go_computer',
  'switch_computer_on',
  'go_computer_chair',
  'use_the_computer'],
 8: ['get_bread', 'go_dining_table', 'eat_cold_meal', 'eat_warm_meal'],
 9: ['get_ingredients_from_shelf', 'go_oven', 'use_oven', '