# Clustering

## Imports

In [17]:
import os
import re
import sys
from subprocess import Popen

import pandas as pd
from sklearn.metrics import silhouette_score

## Inputs

In [13]:
# Directory that includes the sets of local process models.
lpms_directory = os.path.abspath("./data/lpms")

# Directory from which the clustering script is launched.
scripts_directory = os.path.abspath("./scripts/clustering")

# Directory where results will be written.
res_directory = os.path.abspath("results")

## Hierarchical clustering

In [14]:
# cluster
# Launch one clustering subprocess per distance file of every LPM set, then
# wait for all of them and report non-zero exit codes.
cl_processes = []

# The dot is escaped and the pattern anchored at the end so that only names
# ending in ".csv" match; group 1 is passed to the script as the measure name.
distance_file_pattern = re.compile(r"model_(.*)\.csv$")
clustering_script = os.path.join(scripts_directory, "clustering_one.py")

try:
    # The launch loop sits inside the try block so that an interrupt during
    # start-up also terminates the children that were already spawned.
    for lpm_set_name in os.listdir(lpms_directory):
        distances_dir = os.path.join(res_directory, lpm_set_name, "distances")
        clustering_dir = os.path.join(res_directory, lpm_set_name, "clustering")
        if not os.path.isdir(distances_dir):
            # Stray entries in the LPM directory (or sets without computed
            # distances) are reported and skipped instead of aborting the cell.
            print("Skipping " + lpm_set_name + ": no distances directory at " + distances_dir)
            continue
        os.makedirs(clustering_dir, exist_ok=True)
        for file in os.listdir(distances_dir):
            match_measure = distance_file_pattern.search(file)
            if match_measure is None:
                continue
            measure = match_measure.group(1)
            # sys.executable runs the script with the same interpreter (and
            # therefore the same environment) as the notebook kernel.
            cl_processes.append(Popen([sys.executable, clustering_script, measure, os.path.join(distances_dir, file), clustering_dir]))

    exitcodes = [p.wait() for p in cl_processes]
    if not all(el == 0 for el in exitcodes):
        print("Some clustering scripts did not finish successfully: " + str(exitcodes))
except KeyboardInterrupt:
    # Best-effort cancellation: stop every child, then reap it so that no
    # zombie processes are left behind. The interrupt itself is not re-raised.
    for p in cl_processes:
        p.kill()
    for p in cl_processes:
        p.wait()

python C:\Users\peeva\My\code\my\CombiningLPMDandPMSM\scripts\clustering\clustering_one.py efg_distances C:\Users\peeva\My\code\my\CombiningLPMDandPMSM\results\artificialBig\distances\model_efg_distances.csv C:\Users\peeva\My\code\my\CombiningLPMDandPMSM\results\artificialBig\clustering
python C:\Users\peeva\My\code\my\CombiningLPMDandPMSM\scripts\clustering\clustering_one.py full_trace_matching_distances C:\Users\peeva\My\code\my\CombiningLPMDandPMSM\results\artificialBig\distances\model_full_trace_matching_distances.csv C:\Users\peeva\My\code\my\CombiningLPMDandPMSM\results\artificialBig\clustering
python C:\Users\peeva\My\code\my\CombiningLPMDandPMSM\scripts\clustering\clustering_one.py ged_distances C:\Users\peeva\My\code\my\CombiningLPMDandPMSM\results\artificialBig\distances\model_ged_distances.csv C:\Users\peeva\My\code\my\CombiningLPMDandPMSM\results\artificialBig\clustering
python C:\Users\peeva\My\code\my\CombiningLPMDandPMSM\scripts\clustering\clustering_one.py node_distance

## Calculate Silhouette Scores

In [19]:
# Distance measures: the keys are the names used to look up distance matrices,
# the values are the short names used as score column headers.
measures = dict([
    ("efg", "efg"),
    ("full_trace_matching", "full"),
    ("ged", "ged"),
    ("node", "node"),
    ("transition_label", "transition"),
])
# Reverse lookup: short name -> full measure name.
inv_measures = dict(zip(measures.values(), measures.keys()))

In [21]:
# calculate silhouette scores
# For every event log under res_directory, score each hierarchical clustering
# result against the distance matrix of every measure and collect one row per
# clustering result in cl_res_df.
ss_columns = ["Event Log", "Distance Threshold", "Measure", "Num Clusters"] + list(
    measures.values())
ss_records = []  # rows are collected here and turned into a DataFrame once at the end

# Dots are escaped and the patterns anchored so only real CSV result files match.
model_distances_pattern = re.compile(r"model_(.*)_distances\.csv$")
clustering_result_pattern = re.compile(r"clustering_hierarchical_(.*)_distances\.csv$")

for f_log in os.listdir(res_directory):
    distances_dir = os.path.join(res_directory, f_log, "distances")
    clustering_dir = os.path.join(res_directory, f_log, "clustering")
    # res_directory also contains entries that are not event logs (for example
    # the "ss" output folder that the save step creates), so anything without
    # both sub-folders is skipped instead of raising FileNotFoundError.
    if not (os.path.isdir(distances_dir) and os.path.isdir(clustering_dir)):
        continue

    # import distances for all measure pairs
    distances = {}
    for f_dist in os.listdir(distances_dir):
        match_measure = model_distances_pattern.search(f_dist)
        if match_measure is None:
            continue
        measure = match_measure.group(1)
        # Missing distances are filled with 1 before converting to a matrix.
        dist_df = pd.read_csv(os.path.join(distances_dir, f_dist), index_col=0).fillna(1)
        np_dist = dist_df.to_numpy()

        distances[(f_log, measure)] = np_dist

    missing_measures = [m for m in measures if (f_log, m) not in distances]
    if missing_measures:
        # Every row needs a score for every measure; report the gap explicitly
        # rather than failing later with a bare KeyError.
        print("Skipping " + f_log + ": no distance matrix for " + ", ".join(missing_measures))
        continue

    for f2 in os.listdir(clustering_dir):
        match_clustering = clustering_result_pattern.search(f2)
        if match_clustering is None:
            continue  # not a hierarchical clustering result file
        key = tuple(match_clustering.group(1).split("_"))
        if len(key) < 3:
            continue  # file name lacks the threshold / measure components read below

        # read clustering results
        cl_res = os.path.join(clustering_dir, f2)
        cl_df = pd.read_csv(os.path.abspath(cl_res), index_col=0)
        num_clusters = cl_df["Labels"].nunique()
        if num_clusters < 2 or num_clusters >= len(cl_df):
            continue # silhouette score can not be calculated

        distance_threshold = key[1]
        measure = key[2]
        log = f_log

        # calculate silhouette scores
        # NOTE(review): assumes the rows of the clustering file are in the same
        # order as the rows/columns of each distance matrix - confirm.
        ss_scores = [
            silhouette_score(distances[(log, ss_measure)], cl_df["Labels"], metric="precomputed")
            for ss_measure in measures
        ]

        # append result to the collected rows
        ss_records.append([log, distance_threshold, measure, num_clusters] + ss_scores)

# create main clustering dataframe
cl_res_df = pd.DataFrame(ss_records, columns=ss_columns)

In [22]:
# save complete silhouette scores
# Write the collected silhouette scores to <res_directory>/ss/complete_ss.csv.

ss_dir = os.path.join(res_directory, "ss")
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race; makedirs also creates res_directory itself if it is missing.
os.makedirs(ss_dir, exist_ok=True)
cl_res_df.to_csv(os.path.join(ss_dir, "complete_ss.csv"), index=False)