# Clustering

## Imports

In [1]:
import os
from subprocess import Popen

import re

import pandas as pd

from sklearn.metrics import silhouette_score

## Inputs

In [16]:
# directory that contains the sets of local process models (one entry per set)
lpms_directory = os.path.abspath("./data/lpms")
# directory that contains the clustering scripts (clustering_one.py is launched from here)
scripts_directory = os.path.abspath("./scripts/clustering")

# directory where the per-set results are read from and written to
res_directory = os.path.abspath("results/set_wise")

## Hierarchical clustering

In [9]:
# cluster
#
# Launches one subprocess per distance matrix of every LPM set. Each process
# runs clustering_one.py (from scripts_directory) with the arguments
# (measure, path of the distance CSV, output directory). All processes are
# started up front and run concurrently; the cell then waits for all of them.

# Distance files are named model_<measure>_distances.csv; the dot is escaped
# and the pattern is anchored at the end so that e.g. backup copies
# ("...distances.csv.bak") are not picked up.
distance_file_pattern = re.compile(r"model_(.*)_distances\.csv$")

cl_processes = []
for lpm_set_name in os.listdir(lpms_directory):
    distances_dir = os.path.join(res_directory, lpm_set_name, "distances")
    if not os.path.isdir(distances_dir):
        # stray entries in the LPM directory or sets without computed
        # distances would otherwise abort the whole cell in os.listdir
        print("Skipping " + lpm_set_name + ": no distances directory at " + distances_dir)
        continue

    clustering_dir = os.path.join(res_directory, lpm_set_name, "clustering")
    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs(clustering_dir, exist_ok=True)

    # os.listdir already yields bare file names, so no basename() is needed
    for distances_file in os.listdir(distances_dir):
        match_measure = distance_file_pattern.search(distances_file)
        if match_measure is None:
            continue
        measure = match_measure.group(1)
        cl_processes.append(Popen([
            "python",
            os.path.join(scripts_directory, "clustering_one.py"),
            measure,
            os.path.join(distances_dir, distances_file),
            clustering_dir,
        ]))

try:
    exitcodes = [p.wait() for p in cl_processes]
    if any(code != 0 for code in exitcodes):
        print("Some clustering scripts did not finish successfully: " + str(exitcodes))
except KeyboardInterrupt:
    # interrupting the cell stops all children; kill() is a no-op for
    # processes that have already finished
    for p in cl_processes:
        p.kill()
    # reap the killed children so they do not linger as zombie processes
    for p in cl_processes:
        p.wait()

## Calculate Silhouette Scores

In [10]:
# Distance measures used in the evaluation, as "long name -> short name".
# The long names (keys) are used to look up the distance matrices, the short
# names (values) are used as column names of the silhouette score table.
measures = dict(
    efg="efg",
    full_trace_matching="full",
    ged="ged",
    node="node",
    transition_label="transition",
)
# reverse lookup: short name -> long name
inv_measures = dict(zip(measures.values(), measures.keys()))

In [None]:
# calculate silhouette scores
#
# For every hierarchical clustering result, the silhouette score is computed
# with respect to every distance measure in `measures`.
#
# Directory layout read by this cell:
#   <res_directory>/<log>/distances/model_<measure>_distances.csv
#   <res_directory>/<log>/clustering/<iteration>/clustering_hierarchical_<...>.csv
#
# Result: `cl_res_df` with one row per clustering result and the columns
# "Event Log", "Distance Threshold", "Measure", "Num Clusters", "Iteration"
# followed by one silhouette score column per measure (short names).

# compiled once instead of inside the loops; dots are escaped and the patterns
# are anchored at the end so only real .csv files match
distance_file_pattern = re.compile(r"model_(.*)_distances\.csv$")
clustering_file_pattern = re.compile(r"clustering_hierarchical_(.*)\.csv$")

result_columns = ["Event Log", "Distance Threshold", "Measure", "Num Clusters", "Iteration"] + list(
    measures.values())
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame row by row via .loc copies data on every insert and
# becomes very slow for hundreds of thousands of rows.
result_rows = []

for f_log in os.listdir(res_directory):
    distances_dir = os.path.join(res_directory, f_log, "distances")
    clustering_dir = os.path.join(res_directory, f_log, "clustering")
    if not (os.path.isdir(distances_dir) and os.path.isdir(clustering_dir)):
        # stray files or incomplete result folders would otherwise abort the run
        print("Skipping " + f_log + ": missing distances or clustering directory")
        continue

    # import the distance matrix of every measure for this log
    distances = {}
    for f_dist in os.listdir(distances_dir):
        match_measure = distance_file_pattern.search(f_dist)
        if match_measure is None:
            continue
        measure = match_measure.group(1)
        # missing distances are replaced by 1
        # NOTE(review): presumably 1 is the maximum distance - confirm
        dist_df = pd.read_csv(os.path.join(distances_dir, f_dist), index_col=0).fillna(1)
        distances[(f_log, measure)] = dist_df.to_numpy()

    for f_iter in os.listdir(clustering_dir):
        iter_clustering_dir = os.path.join(clustering_dir, f_iter)
        if not os.path.isdir(iter_clustering_dir):
            continue  # only sub-directories hold clustering results
        for f2 in os.listdir(iter_clustering_dir):
            regex_res = clustering_file_pattern.search(f2)
            if regex_res is None:
                print(f2)  # report files that are not clustering results
                continue
            # underscore-separated fields of the file name; the second and
            # third field are used as distance threshold and measure
            key = tuple(regex_res.group(1).split("_"))

            # read clustering results
            cl_df = pd.read_csv(os.path.abspath(os.path.join(iter_clustering_dir, f2)), index_col=0)
            labels = cl_df["Labels"]
            num_clusters = labels.nunique()
            if num_clusters < 2 or num_clusters >= len(cl_df):
                continue  # silhouette score can not be calculated

            distance_threshold = key[1]
            measure = key[2]

            # silhouette score of this clustering under every distance measure
            # NOTE(review): assumes the rows of the clustering file are in the
            # same order as the rows of the distance matrices - confirm
            ss_scores = [
                silhouette_score(distances[(f_log, ss_measure)], labels, metric="precomputed")
                for ss_measure in measures
            ]

            result_rows.append([f_log, distance_threshold, measure, num_clusters, f_iter] + ss_scores)

# create main clustering dataframe
cl_res_df = pd.DataFrame(result_rows, columns=result_columns)

In [20]:
# save complete silhouette scores

ss_dir = os.path.abspath("results/ss")
# makedirs with exist_ok creates missing parent directories as well and does
# not fail when the directory already exists
os.makedirs(ss_dir, exist_ok=True)
cl_res_df.to_csv(os.path.join(ss_dir, "complete_ss.csv"), index=False)

In [21]:
# display the collected silhouette scores as the cell output
cl_res_df

Unnamed: 0,Event Log,Distance Threshold,Measure,Num Clusters,Iteration,efg,full,ged,node,transition
0,Road_Traffic_Fine_Management_Process,0.1,efg,52,725,0.419928,0.213831,-0.009251,0.027040,-0.030000
1,Road_Traffic_Fine_Management_Process,0.4,full,20,725,-0.026918,0.289116,-0.066074,-0.119204,-0.308793
2,Road_Traffic_Fine_Management_Process,0.8,full,5,725,0.095055,0.153358,0.067034,0.080187,0.025933
3,Road_Traffic_Fine_Management_Process,0.7,efg,4,725,0.227394,0.133882,0.217704,0.223866,0.323858
4,Road_Traffic_Fine_Management_Process,0.2,efg,27,725,0.456786,0.169013,0.156027,0.228099,0.150000
...,...,...,...,...,...,...,...,...,...,...
436995,Hospital Billing - Event Log,0.4,node,10,26,0.526946,0.245938,0.220801,0.444703,0.685533
436996,Hospital Billing - Event Log,0.4,efg,3,26,0.306949,0.191029,0.215609,0.336457,0.403731
436997,Hospital Billing - Event Log,0.1,transition,17,26,0.670775,0.256277,0.252427,0.502523,1.000000
436998,Hospital Billing - Event Log,0.5,node,9,26,0.503985,0.257692,0.241956,0.437656,0.653824
