In [None]:
from pathlib import Path

import networkx as nx
import pm4py
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.objects.log.importer.xes import importer as xes_importer

#from clustering_utils import  build_cdlib_graph
from utils import pre_processing_xes

In [None]:
# LOADING DATA
# Directory of the dataset under analysis. Change it here to switch dataset
# (e.g. 'sim_22d1p_22', whose labelled log is named 'EventLogXES.xes').
DATA_DIR = 'BP-Meets-IoT2020/d21p1'

# Event log not partitioned (single trace with a list of events)
event_log = xes_importer.apply(f'{DATA_DIR}/EventLogXESNoSegment.xes')
# Event log partitioned (list of named traces)
event_log_labelled = xes_importer.apply(f'{DATA_DIR}/EventLogXES_fixed.xes')

In [None]:
# Pre-processing: removing noise from both logs.
# NOTE(review): the raw logs are overwritten by the pre-processed ones, so
# re-running this cell alone feeds already pre-processed logs back into
# pre_processing_xes -- confirm that this is harmless, or reload the data first.
event_log, event_log_labelled = pre_processing_xes(event_log, event_log_labelled)

In [None]:
# Export the pre-processed labelled log to CSV in order to check that its
# structure is preserved.
dataframe_labelled = pm4py.convert_to_dataframe(event_log_labelled)

csv_path = Path('abstraction/EventLogXES_fixed.csv')
# to_csv does not create missing directories, so make sure the target exists.
csv_path.parent.mkdir(parents=True, exist_ok=True)
dataframe_labelled.to_csv(csv_path)

In [None]:
# Build the directly-follows graph (DFG) of the non-partitioned event log
# with PM4Py.
dfg_graph = dfg_discovery.apply(event_log)

In [None]:
from clustering_utils import  build_cdlib_graph

In [None]:
# Turn the DFG into the graph `G` that the community-detection algorithms
# below run on. `nodes` is later indexed by node id to obtain event names
# (see comm_to_names); presumably `nameIndex` is the reverse lookup -- TODO
# confirm against clustering_utils.
G, nodes, nameIndex = build_cdlib_graph(dfg_graph)

# Ground truth
The ground truth is represented by the communities present in the labelled log file, where each community is labelled with the name of its activity.


In [None]:
from clustering_utils import  build_ground_truth

In [None]:
# Build the reference clustering from the labelled (partitioned) event log.
ground_truth, ground_truth_dictionary = build_ground_truth(event_log_labelled, nameIndex, nodes)

# Algorithms evaluation
Often it makes sense to execute a given CD algorithm multiple times, varying its parameters, so as to identify the optimal configuration w.r.t. a given fitness score.
We now experiment with several algorithms: louvain, leiden, infomap, ...

In [None]:
from cdlib import algorithms

In [None]:
import hashlib

def comm_to_names(communities, nodes, sort_events=False):
    """Print one summary line per community, with node ids resolved to names.

    The output starts with a header line and ends with a footer line. In
    between, each community produces one tab-separated line holding a 1-based
    counter, the number of members, the MD5 hex digest of the comma-joined
    member names, and the comma-joined member names themselves. The digest
    makes it easy to spot identical communities across different runs.

    Args:
        communities: Iterable of communities; each community is an iterable
            of keys that can be used to index ``nodes``.
        nodes: Sequence or mapping such that ``nodes[i]`` is the name (a
            string) of node ``i``.
        sort_events: When true, the names of each community are sorted before
            being joined, so that the digest does not depend on member order.

    Returns:
        None. Everything is written to standard output.
    """
    print("*** Communities to names")
    # Create structure for segmented-log comparison with resulting communities
    for idx, community in enumerate(communities, start=1):
        vals = [nodes[i] for i in community]
        if sort_events:
            vals.sort()
        vals_str = ",".join(vals)
        # MD5 is only a compact fingerprint here, not a security measure.
        vals_hash = hashlib.md5(vals_str.encode()).hexdigest()
        print(f"Community [{idx}]\t{len(vals)}\t{vals_hash}\t{vals_str}")
    print("*******\n")


In [None]:
# Baseline Louvain run on the weighted graph, with resolution 1.
comm_louvain = algorithms.louvain(G, weight="weight", resolution=1.0)

In [None]:
# Print the communities of the most recent Louvain run as sorted event names.
comm_to_names(comm_louvain.communities, nodes, sort_events=True)

In [None]:
#comm_louvain.communities

In [None]:
# Louvain with the resolution parameter raised to 2 (rebinds comm_louvain).
comm_louvain = algorithms.louvain(G, weight='weight', resolution=2)
comm_to_names(comm_louvain.communities, nodes, sort_events=True)

In [None]:
# Louvain with the resolution parameter raised to 3 (rebinds comm_louvain).
comm_louvain = algorithms.louvain(G, weight='weight', resolution=3)
comm_to_names(comm_louvain.communities, nodes, sort_events=True)

In [None]:
# Leiden with its default parameters (no edge-weight argument is passed here).
comm_leiden = algorithms.leiden(G)
comm_to_names(comm_leiden.communities, nodes, sort_events=True)

In [None]:
# Infomap with its default parameters; the flags argument is left disabled.
comm_infomap = algorithms.infomap(G)#, flags="--use-node-weights-as-flow")
comm_to_names(comm_infomap.communities, nodes, sort_events=True)

In [None]:
# DER (Diffusion Entropy Reducer). The tuning values are passed by keyword so
# that their role is visible: random-walk length, stopping threshold and
# maximum number of iterations.
comm_der = algorithms.der(G, walk_len=500, threshold=.1, iter_bound=100)
comm_to_names(comm_der.communities, nodes, sort_events=True)

In [None]:
# RB Potts model on the weighted graph, with resolution parameter 3.
comm_rb_pots = algorithms.rb_pots(G, weights="weight", resolution_parameter=3)
comm_to_names(comm_rb_pots.communities, nodes, sort_events=True)

In [None]:
# Surprise communities with default parameters; edge weights are left disabled.
comm_surprise_communities = algorithms.surprise_communities(G)#, weights="weight")
comm_to_names(comm_surprise_communities.communities, nodes, sort_events=True)

In [None]:
# FRC-FGSN with theta=0, eps=0.5 and r=3.
comm_frc_fgsn = algorithms.frc_fgsn(G, theta=0, eps=0.5, r=3)
comm_to_names(comm_frc_fgsn.communities, nodes, sort_events=True)

In [None]:
from cdlib import evaluation

In [None]:
# F1 score of the FRC-FGSN communities against the ground truth.
evaluation.f1(comm_frc_fgsn, ground_truth)

In [None]:
# Rebinds comm_louvain: the evaluation cells below score this run, not the
# earlier Louvain runs.
# NOTE(review): no random seed is set in the visible cells, so this partition
# may differ between executions -- confirm whether reproducibility is needed.
comm_louvain = algorithms.louvain(G, weight='weight', resolution=3, randomize=True)

In [None]:
# Community-size score of the latest Louvain partition on G.
evaluation.size(G, comm_louvain)

In [None]:
# F1 score of the latest Louvain partition against the ground truth.
evaluation.f1(comm_louvain, ground_truth)

In [None]:
# Agreement between the Louvain and Infomap partitions, measured by
# normalized mutual information.
evaluation.normalized_mutual_information(comm_louvain, comm_infomap)


In [None]:
# F1 score of the Infomap partition against the ground truth.
evaluation.f1(comm_infomap, ground_truth)


In [None]:
# Omega index of the Infomap partition.
# NOTE(review): both arguments are the same clustering, so this compares
# comm_infomap with itself. If the intent was to score it against the
# reference, the second argument should presumably be ground_truth -- confirm.
evaluation.omega(comm_infomap,comm_infomap)

In [None]:
from cdlib import ensemble

# Run Louvain once per resolution value in the grid and print every result.
resolution = ensemble.Parameter(name="resolution", start=0.7, end=9, step=0.1)
louvain_sweep = ensemble.grid_execution(
    graph=G,
    method=algorithms.louvain,
    parameters=[resolution],
)

for coms in louvain_sweep:
    print(coms.method_name, coms.method_parameters, "\n", coms.communities, "\n")

In [None]:
# Search the resolution grid for the Louvain configuration that maximises the
# Erdos-Renyi modularity.
resolution = ensemble.Parameter(name="resolution", start=1, end=9, step=0.1)
#randomize = ensemble.BoolParameter(name="randomize")

coms, scoring = ensemble.grid_search(
    graph=G,
    method=algorithms.louvain,
    parameters=[resolution],
    quality_score=evaluation.erdos_renyi_modularity,
    aggregate=max,
)

best_run_report = (coms.communities, coms.method_parameters, scoring)
print("Communities:\n %s \nConfiguration: %s \nScoring: %s" % best_run_report)

In [None]:
# FRC-FGSN again, this time with theta=0.001, scored against the ground truth
# (rebinds comm_frc_fgsn).
comm_frc_fgsn = algorithms.frc_fgsn(G, theta=0.001, eps=0.5, r=3)
#comm_to_names(comm_frc_fgsn.communities, nodes, sort_events=True)
evaluation.f1(comm_frc_fgsn, ground_truth)

# Overlapping

In [None]:
# DCS with its default parameters, printed as sorted event names.
dcs = algorithms.dcs(G)
comm_to_names(dcs.communities, nodes, sort_events=True)

In [None]:
# F1 score of the DCS communities against the ground truth.
evaluation.f1(dcs, ground_truth)
