# Path Config & Global Imports

In [None]:
# ======================================================
# 0) Environment Setup MUST come first
# ======================================================
# The variable is exported before any numerical library is imported so that
# the setting is already in the environment when those libraries load.
import os
os.environ["OMP_NUM_THREADS"] = "1"   # MUST be before numpy/sklearn imports


# ======================================================
# 1) Imports & Global Config
# ======================================================
# Standard library
import random
from pathlib import Path

# Numerical / Data handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from IPython.display import display

# ML / NLP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from minisom import MiniSom
from hdbscan import HDBSCAN

# Reproducibility
# Seed every global random number generator used by this notebook.
GLOBAL_SEED = 42
random.seed(GLOBAL_SEED)       # Python's built-in RNG
np.random.seed(GLOBAL_SEED)    # NumPy's global (legacy) RNG, previously left unseeded
# NOTE: PYTHONHASHSEED is only read when an interpreter starts, so this
# assignment cannot change hashing in the running kernel; it is kept so that
# child processes spawned from here inherit a fixed hash seed.
os.environ["PYTHONHASHSEED"] = str(GLOBAL_SEED)

# Project path: the current working directory is the root for data files.
BASE_PATH = Path.cwd()
print(f"Working in: {BASE_PATH}")

# Warnings cleanup: hide FutureWarning messages in the cell outputs.
import warnings
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Load clustering utilities (project-local helper module, used below as `cu`).
import cluster_utils as cu
print("Imported cluster_utils.py successfully.")


# Load Data and Create Trace Strings for Vectorization

In [None]:
# ======================================================
# Vectorization & Clustering Pipeline (Annotated Version)
# ======================================================
# This script loads event-log data, constructs textual variants of cases,
# encodes them using TF-IDF and Doc2Vec, reduces dimensionality when needed,
# and runs several clustering algorithms (KMeans, SOM, HDBSCAN). Metrics and
# cluster-level summaries are also computed.


# ======================================================
# 2) Load Event Log Data
# ======================================================

log_path = BASE_PATH / "df_sampled_200_cases.csv"

# Read the raw events; unparseable timestamps are coerced to NaT so the
# affected rows can be discarded in the next step.
df = pd.read_csv(log_path)
df["time:timestamp"] = pd.to_datetime(df["time:timestamp"], errors="coerce")

# Keep only events with a valid timestamp, ordered by case and then by time.
df = (
    df.dropna(subset=["time:timestamp"])
    .sort_values(by=["case:concept:name", "time:timestamp"])
    .copy()
)

print(f"Log loaded with {len(df)} events.")
df.head()





## Extract Trace Sequences

In [None]:
# ======================================================
# 3) Extract trace sequences → FILTERED
# ======================================================

# Column names and the minimum variant frequency are forwarded unchanged to
# the cluster_utils helper.
filtered = cu.extract_trace_sequences(
    df,
    case_col="case:concept:name",
    activity_col="concept:name",
    timestamp_col="time:timestamp",
    min_variant_freq=2,
)
print(f"Extracted {len(filtered)} trace sequences.")

# PM4Py downstream requires case IDs as index
filtered = filtered.set_index(keys="case:concept:name")
filtered.head()



## Vectorize trace sequences

In [None]:
# ======================================================
# 4) Vectorization (TF-IDF + SVD + Doc2Vec)
# ======================================================

# The trace strings of `filtered` are the documents to encode
# (an earlier version passed `seqs` here).
encoders, artifacts = cu.vectorize(filtered["trace_str"], method="both")

# Report the shape of every encoding that was produced.
for name, matrix in encoders.items():
    print(name, matrix.shape)

# The TF-IDF + SVD encoding is the feature matrix for the first clustering round.
X = encoders["TFIDF_SVD"]
print(f"Using TFIDF_SVD: {X.shape}")


## TFIDF Clustering

In [None]:
# ======================================================
# 5) KMeans Sweep → Best K
# ======================================================

# Candidate cluster counts: 2 through 15.
K_RANGE = range(2, 16)

# `best_km` is indexed with "k" further down; `km_results` feeds the trend plot.
best_km, km_results = cu.find_num_clusters(X, K_RANGE, cluster_algo="kmeans")

best_km


In [None]:
# Plot silhouette curve
# `km_results` is the second value returned by the KMeans sweep
# (cu.find_num_clusters); the string is passed as the plot title.
cu.plot_silhouette_trend(km_results, "KMeans Silhouette Trend (TF-IDF)")


In [None]:
# Final KMeans clustering with the k selected by the sweep.
chosen_k = best_km["k"]

labels, model = cu.cluster_traces(X, num_clusters=chosen_k, cluster_algo="kmeans")
print("Assigned clusters:", set(labels))

# The helper receives `filtered` with the case id restored as a regular column.
cluster_stats = cu.mine_from_clusters(
    labels,
    num_clusters=chosen_k,
    sequences_df=filtered.reset_index(),
)

cluster_stats


In [None]:
# ======================================================
# 6) SOM Sweep → Best K
# ======================================================

# Same sweep as for KMeans, run with the SOM algorithm on the TF-IDF matrix.
best_som, som_results = cu.find_num_clusters(X, K_RANGE, cluster_algo="som")
cu.plot_silhouette_trend(som_results, "SOM Silhouette Trend (TF-IDF)")

# Final SOM clustering with the selected k (the fitted model is discarded).
labels_som, _ = cu.cluster_traces(X, num_clusters=best_som["k"], cluster_algo="som")

# The helper receives `filtered` with the case id restored as a regular column.
cu.mine_from_clusters(
    labels_som,
    num_clusters=best_som["k"],
    sequences_df=filtered.reset_index(),
)


In [None]:
# ======================================================
# 7) HDBSCAN (no K needed)
# ======================================================

# No cluster count is passed for HDBSCAN.
labels_hdbscan, hdb_model = cu.cluster_traces(X, cluster_algo="hdbscan")

# num_clusters is explicitly None; the case id is restored as a column first.
cluster_stats_hdb = cu.mine_from_clusters(
    labels_hdbscan,
    num_clusters=None,
    sequences_df=filtered.reset_index(),
)

cluster_stats_hdb


In [None]:
# ======================================================
# 8) Attach Cluster Assignments to FILTERED
# ======================================================
# Each label sequence becomes a new column of `filtered`; these column names
# are the `cluster_col` values used by the per-cluster discovery calls later.
# NOTE(review): assumes every label sequence has one entry per row of
# `filtered`, in row order - confirm against cluster_utils.

filtered["cluster_kmeans_tfidf_svd"] = labels
filtered["cluster_som_tfidf_svd"]     = labels_som
filtered["cluster_hdbscan_tfidf_svd"] = labels_hdbscan

filtered.head()


# Doc2Vec Clustering

In [None]:
# ======================================================
# DOC2VEC CLUSTERING (KMeans, SOM, HDBSCAN)
# ======================================================

# Switch the feature matrix to the Doc2Vec encoding produced by cu.vectorize.
X_doc2vec = encoders["DOC2VEC"]
print(f"Using Doc2Vec embedding: {X_doc2vec.shape}")


In [None]:
# KMeans sweep over K_RANGE on the Doc2Vec vectors, followed by the trend plot.
best_km_doc, km_results_doc = cu.find_num_clusters(X_doc2vec, K_RANGE, cluster_algo="kmeans")
cu.plot_silhouette_trend(km_results_doc, "KMeans Silhouette Trend (Doc2Vec)")

# Final clustering with the k selected by the sweep.
chosen_k_doc = best_km_doc["k"]
labels_kmeans_doc2vec, model_kmeans_doc2vec = cu.cluster_traces(
    X_doc2vec, num_clusters=chosen_k_doc, cluster_algo="kmeans"
)

# The helper receives `filtered` with the case id restored as a regular column.
cu.mine_from_clusters(
    labels_kmeans_doc2vec,
    num_clusters=chosen_k_doc,
    sequences_df=filtered.reset_index(),
)


In [None]:
# SOM sweep over K_RANGE on the Doc2Vec vectors, followed by the trend plot.
best_som_doc, som_results_doc = cu.find_num_clusters(X_doc2vec, K_RANGE, cluster_algo="som")
cu.plot_silhouette_trend(som_results_doc, "SOM Silhouette Trend (Doc2Vec)")

# Final SOM clustering with the selected k (the fitted model is discarded).
labels_som_doc2vec, _ = cu.cluster_traces(
    X_doc2vec, num_clusters=best_som_doc["k"], cluster_algo="som"
)

# The helper receives `filtered` with the case id restored as a regular column.
cu.mine_from_clusters(
    labels_som_doc2vec,
    num_clusters=best_som_doc["k"],
    sequences_df=filtered.reset_index(),
)


In [None]:
# HDBSCAN on the Doc2Vec vectors; no cluster count is passed.
labels_hdbscan_doc2vec, hdb_model_doc2vec = cu.cluster_traces(X_doc2vec, cluster_algo="hdbscan")

# NOTE(review): unlike the TF-IDF HDBSCAN cell, num_clusters is not passed
# here; this relies on the helper's default - confirm it is None.
cu.mine_from_clusters(labels_hdbscan_doc2vec, sequences_df=filtered.reset_index())


In [None]:
# Store the Doc2Vec-based cluster labels as columns of `filtered`; these
# column names are the `cluster_col` values used by the per-cluster calls.
# NOTE(review): assumes every label sequence has one entry per row of
# `filtered`, in row order - confirm against cluster_utils.
filtered["cluster_kmeans_doc2vec"]  = labels_kmeans_doc2vec
filtered["cluster_som_doc2vec"]     = labels_som_doc2vec
filtered["cluster_hdbscan_doc2vec"] = labels_hdbscan_doc2vec


# Process Discovery - Global

In [None]:
# =============================================================
# Global Process Discovery Pipeline (Annotated Version)
# =============================================================
# This script performs *global* process discovery using PM4Py.
# It discovers a single process model for the entire log (no clustering),
# evaluates conformance (fitness + precision), visualizes the resulting
# models (Petri net / BPMN / Heuristics Net), and computes variability.


# =============================================================
# 0) Imports
# =============================================================
import pandas as pd
import numpy as np

# Log conversion utilities
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter

# Conformance evaluation (fitness + precision).
# `fitness_evaluator` is the generic replay-fitness entry point because every
# call site passes `variant=FITNESS_VARIANT`. It used to be bound to the
# token_replay variant module here, which meant that re-running this cell
# (e.g. after changing MINER_TYPE) broke the evaluation functions until the
# later import cell was executed again.
from pm4py.algo.evaluation.replay_fitness import algorithm as fitness_evaluator
from pm4py.algo.evaluation.replay_fitness.algorithm import Variants as FitnessVariants
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.precision.algorithm import Variants as PrecisionVariants

from pm4py.visualization.heuristics_net import visualizer as hn_visualizer

# =============================================================
# 1) Configuration
# =============================================================
MINER_TYPE = "inductive"  # options: "inductive", "heuristics", "alpha"
RANDOM_STATE = 42         # deterministic sampling
EVAL_SAMPLE_SIZE = 2000   # conformance evaluation log sample size

# Conformance settings
# checks *fit* (does the model reproduce behavior?)
FITNESS_VARIANT = FitnessVariants.TOKEN_BASED
# checks *specificity* (does it avoid overgeneralizing?)
PRECISION_VARIANT = PrecisionVariants.ETCONFORMANCE_TOKEN




In [None]:


def discover_model_for_miner(log, miner_type=None):
    """
    Discover a Petri net from ``log`` with the selected PM4Py miner.

    The PM4Py modules are imported inside each branch so the function does
    not depend on which notebook cells were executed beforehand; this also
    makes the "alpha" branch usable (its miner was never imported before).

    Args:
        log: PM4Py event log to mine.
        miner_type (str or None): "inductive", "heuristics" or "alpha".
            When None (default) the module-level MINER_TYPE is used.

    Returns:
        tuple: ``(net, im, fm, heu_net, process_tree)``
        - ``heu_net`` is set only by the heuristics miner (otherwise None)
        - ``process_tree`` is set only by the inductive miner (otherwise None)

    Raises:
        ValueError: If the miner type is not one of the supported options.
    """
    selected = MINER_TYPE if miner_type is None else miner_type

    if selected == "inductive":
        from pm4py.algo.discovery.inductive import algorithm as inductive_miner
        from pm4py.objects.conversion.process_tree import converter as pt_converter

        process_tree = inductive_miner.apply(log)
        net, im, fm = pt_converter.apply(process_tree)
        return net, im, fm, None, process_tree     # heuristics net = None

    if selected == "alpha":
        from pm4py.algo.discovery.alpha import algorithm as alpha_miner

        net, im, fm = alpha_miner.apply(log)
        return net, im, fm, None, None     # no heuristics net, no process tree

    if selected == "heuristics":
        from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
        from pm4py.objects.conversion.heuristics_net import converter as hn_converter

        heu_net = heuristics_miner.apply_heu(log)
        net, im, fm = hn_converter.apply(heu_net)
        return net, im, fm, heu_net, None

    raise ValueError(f"❌ Unknown MINER_TYPE: {selected}")

# =============================================================
# 3) Sampling Utility
# =============================================================
def maybe_sample_log(event_log, max_traces):
    """Sample a subset of traces from a log if it exceeds max_traces.

    The sample is drawn without replacement from a generator seeded with
    RANDOM_STATE, and the selected traces keep their original relative order.

    Args:
        event_log (EventLog): PM4Py event log object.
        max_traces (int or None): Maximum number of traces to keep.
            None disables sampling.

    Returns:
        EventLog: The original log object when no sampling is needed,
        otherwise a new EventLog holding the sampled traces.
    """
    if (max_traces is None) or (len(event_log) <= max_traces):
        return event_log

    # Imported here because only the sampling path builds a new EventLog;
    # the name was previously never imported, so this path raised NameError.
    from pm4py.objects.log.obj import EventLog

    idx = np.random.RandomState(RANDOM_STATE).choice(len(event_log), size=max_traces, replace=False)
    return EventLog([event_log[i] for i in sorted(idx)])

# =============================================================
# 4) Variability Measure
# =============================================================
# Variant variability reflects behavioral diversity.
def compute_variability_ratio(log):
    """
    Return the share of distinct variants among the traces of ``log``.

    A variant is the ordered tuple of "concept:name" values of a trace, so
    the ratio is (# unique variants) / (# total traces). Lower values
    indicate more homogeneous behavior.

    Args:
        log (EventLog): Iterable of traces; each trace yields events that
            can be indexed with "concept:name".

    Returns:
        float: variability ratio, or 0.0 for an empty log.
    """
    trace_count = len(log)
    if trace_count == 0:
        return 0.0

    distinct_variants = {
        tuple(event["concept:name"] for event in trace)
        for trace in log
    }
    return len(distinct_variants) / trace_count

In [None]:
# This section MUST be ran before Process discovery
from pm4py.algo.evaluation.replay_fitness import algorithm as fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator

from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.objects.conversion.process_tree import converter as pt_converter

from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.objects.conversion.heuristics_net import converter as hn_converter

from pm4py.algo.evaluation.replay_fitness.algorithm import Variants as FitnessVariants
from pm4py.algo.evaluation.precision.algorithm import Variants as PrecisionVariants

from pm4py.visualization.petri_net import visualizer as pn_visualizer

In [None]:
from pm4py.visualization.petri_net import visualizer as pn_visualizer
from pm4py.objects.conversion.process_tree import converter as process_tree_converter
from pm4py.visualization.bpmn import visualizer as bpmn_visualizer

# =============================================================
# 5) Global Process Discovery + Evaluation
# =============================================================
def evaluate_global_model(df, metrics_df=None):
    """
    Perform discovery → visualization → conformance evaluation for a *global* model.

    The whole log is mined with the miner selected by MINER_TYPE, the model is
    rendered (BPMN for "inductive", heuristics net for "heuristics", nothing
    for other miners) and then scored on a sample of at most EVAL_SAMPLE_SIZE
    traces.

    Args:
        df (pd.DataFrame): Raw event log with columns:
            - case:concept:name
            - concept:name
            - time:timestamp
        metrics_df (pd.DataFrame or None):
            - If provided and non-empty, the new row is appended to it.
            - If None or empty, a new table is created.

    Returns:
        pd.DataFrame: Metrics table with one additional row holding
        Method, Cluster, NumTraces, Precision, Fitness, FScore and
        VariabilityRatio for the global model.
    """
    # ----------------------------------------------
    # Convert pandas → PM4Py EventLog
    # ----------------------------------------------
    params = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: "case:concept:name"}
    global_log = log_converter.apply(df, variant=log_converter.Variants.TO_EVENT_LOG, parameters=params)

    # ----------------------------------------------
    # Model discovery
    # ----------------------------------------------
    net, im, fm, heu_net, process_tree = discover_model_for_miner(global_log)

    # ----------------------------------------------
    # Visualization (BPMN or heuristics net)
    # ----------------------------------------------
    if MINER_TYPE == "inductive":
        max_transitions = None  # None = no size limit

        print("PN size:", len(net.places), len(net.transitions))

        if max_transitions is None or len(net.transitions) < max_transitions:
            # The process tree is rendered as BPMN; pn_visualizer.apply(net, im, fm)
            # can be used instead to render the Petri net itself.
            print("Rendering BPMN...")
            bpmn_graph = process_tree_converter.apply(
                process_tree,
                variant=process_tree_converter.Variants.TO_BPMN,
            )
            gviz = bpmn_visualizer.apply(bpmn_graph)
            bpmn_visualizer.view(gviz)
        else:
            print(f"PN too large ({len(net.transitions)} transitions) — skipping visualization")

    elif MINER_TYPE == "heuristics":
        # heuristics nets need their own visualizer
        print("Rendering Heuristics Net...")
        gviz = hn_visualizer.apply(heu_net)
        hn_visualizer.view(gviz)

    # ----------------------------------------------
    # Conformance checking
    # ----------------------------------------------
    eval_log = maybe_sample_log(global_log, EVAL_SAMPLE_SIZE)

    fit_res = fitness_evaluator.apply(eval_log, net, im, fm, variant=FITNESS_VARIANT)
    # Report the same "log_fitness" value as the per-cluster evaluation so the
    # rows of both tables are comparable. "perc_fit_traces" is deliberately not
    # used as a fallback: it is a percentage (0-100) and would distort the F-score.
    fitness = fit_res.get("log_fitness", fit_res.get("average_trace_fitness", np.nan))
    precision = precision_evaluator.apply(eval_log, net, im, fm, variant=PRECISION_VARIANT)
    # Harmonic mean of precision and fitness; 0.0 when it is undefined.
    fscore = 2 * (precision * fitness) / (precision + fitness) if (precision + fitness) > 0 else 0.0

    # Variability is computed on the full log, not on the evaluation sample.
    global_variability = compute_variability_ratio(global_log)

    # ----------------------------------------------
    # Build result row
    # ----------------------------------------------
    row_dict = {
        "Method": "Global",
        "Cluster": "Global",
        "NumTraces": len(global_log),
        "Precision": float(precision),
        "Fitness": float(fitness),
        "FScore": float(fscore),
        "VariabilityRatio": float(global_variability),
    }

    # Initialize or append
    if (metrics_df is None) or (len(metrics_df) == 0):
        metrics_df = pd.DataFrame([row_dict])
    else:
        metrics_df = pd.concat([metrics_df, pd.DataFrame([row_dict])], ignore_index=True)

    print(
        f"🌐 Global Model ({MINER_TYPE}) → "
        f"Precision: {precision:.3f}, Fitness: {fitness:.3f}, "
        f"F-score: {fscore:.3f}, Variability Ratio: {global_variability:.3f}"
    )

    return metrics_df



## When MINER_TYPE = "heuristics" 

In [None]:

# Run global discovery with the currently configured MINER_TYPE and show the
# resulting one-row metrics table (a new table is created: no metrics_df is passed).
metrics_df = evaluate_global_model(df)
display(metrics_df)

## When MINER_TYPE = "inductive" 

In [None]:

# Run global discovery with the currently configured MINER_TYPE.
# NOTE(review): this rebinds `metrics_df`, replacing any table produced by an
# earlier run; pass metrics_df=metrics_df if the rows should accumulate.
metrics_df = evaluate_global_model(df)
display(metrics_df)

# Process Discovery - Per Cluster

## Inductive Miner

In [None]:
# =============================================================
# Per-Cluster Process Discovery & Evaluation (Annotated Version)
# =============================================================
# This script mirrors the global process discovery workflow but applied
# separately to each cluster. For every cluster, we:
# 1. Extract all traces belonging to that cluster
# 2. Convert them to an EventLog
# 3. Discover a process model (IMf version of Inductive Miner)
# 4. Optionally visualize the model (BPMN)
# 5. Compute conformance (precision, fitness, F-score)
# 6. Compute variability ratio
# 7. Append results to a cluster-level metrics table


# =============================================================
# 0) Discovery Function — IMf Variant
# =============================================================
# IMf = Inductive Miner (infrequent) — prunes infrequent behavior according
# to a noise threshold. Lower thresholds keep more behavioral detail (higher
# fitness) at the cost of more complex / less generalizable models; higher
# thresholds give simpler models with lower fitness.

In [None]:
#---BALANCED-----
# Keeps most traces
# Removes some infrequent paths
# Produces a reasonably interpretable model

# --- NEW: IMf variant specifically for per-cluster discovery ---
def discover_model_for_miner_imf(log, noise_threshold=0.2):
    """
    Discover a process tree and Petri net with the IMf inductive miner.

    The default threshold of 0.2 reproduces the previous hard-coded
    ("balanced") behavior. The other presets kept in this notebook as
    commented-out variants use 0.4 (strict) and 0.1 (flexible).

    Args:
        log: PM4Py event log to mine.
        noise_threshold (float): IMf noise threshold, expected in [0, 1].

    Returns:
        tuple: ``(net, im, fm, tree)`` - the Petri net with its initial and
        final markings, plus the process tree it was converted from.

    Raises:
        ValueError: If ``noise_threshold`` lies outside [0, 1].
    """
    if not 0.0 <= noise_threshold <= 1.0:
        raise ValueError(f"noise_threshold must be within [0, 1], got {noise_threshold!r}")

    tree = inductive_miner.apply(
        log,
        variant=inductive_miner.Variants.IMf,
        parameters={"noise_threshold": noise_threshold},
    )
    net, im, fm = pt_converter.apply(tree)
    return net, im, fm, tree

In [None]:
#---STRICT(simpler models, less fitness)-----
# Filters weak paths aggressively
# Great for large noisy logs
# Model may underfit (i.e. lose rare but valid behavior)
# (Disabled preset. The docstring lines are commented out as well: left
#  uncommented and indented they made this cell raise IndentationError.)
#def discover_model_for_miner_imf(log):
    #"""
    #IMf version — matches old pipeline behavior.
    #"""
    #tree = inductive_miner.apply(
        #log,
        #variant=inductive_miner.Variants.IMf,
        #parameters={
            #"noise_threshold": 0.4,
            #"min_dfg_occurrences": 2
        #}
    #)
    #net, im, fm = pt_converter.apply(tree)
    #return net, im, fm, tree

In [None]:
#---FLEXIBLE(complex model, higher fitness)-----
# Almost no pruning
# captures most behavior
# Model may overfit(i.e. messy and less generalizable)
# (Disabled preset. The docstring lines are commented out as well: left
#  uncommented and indented they made this cell raise IndentationError.)
#def discover_model_for_miner_imf(log):
    #"""
    #IMf version — matches old pipeline behavior.
    #"""
    #tree = inductive_miner.apply(
        #log,
        #variant=inductive_miner.Variants.IMf,
        #parameters={
            #"noise_threshold": 0.1,
            #"min_dfg_occurrences": 1
        #}
    #)
    #net, im, fm = pt_converter.apply(tree)
    #return net, im, fm, tree

In [None]:

# =============================================================
# 1) Per-Cluster Discovery & Evaluation
# =============================================================

def discover_and_evaluate_per_cluster(
    df: pd.DataFrame,
    filtered: pd.DataFrame,
    cluster_col: str,
    method_name: str = None,
    cluster_metrics_df: pd.DataFrame = None,
    skip_noise: bool = True,
    noise_label: int = -1,
    visualize: bool = False,
    outdir: str = "cluster_models",
    sample_size: int = None,
):
    """
    Discover and evaluate process models *independently for each cluster*.

    For every cluster label in ``filtered[cluster_col]`` the matching events
    are selected from ``df``, converted to a PM4Py EventLog, mined with
    ``discover_model_for_miner_imf`` and scored (precision, fitness, F-score,
    variability ratio) against that same cluster log.

    Args:
        df: Original full event log (pandas DataFrame) with the columns
            "case:concept:name" and "time:timestamp".
        filtered: DataFrame containing case IDs (as index named
            "case:concept:name" or as a column of that name) plus the
            cluster assignments.
        cluster_col: Column name inside `filtered` identifying cluster labels.
        method_name: Name to insert in the metrics output (defaults to cluster_col).
        cluster_metrics_df: Existing table to append results to. None or an
            empty table starts a new one.
        skip_noise: If True, skip the noise cluster (e.g., HDBSCAN label -1).
        noise_label: Label representing noise.
        visualize: If True, render BPMN for each cluster.
        outdir: Reserved; currently unused.
        sample_size: Reserved; currently unused. Every cluster is evaluated
            on all of its traces.

    Returns:
        The metrics table with one added row per evaluated cluster. When no
        cluster is evaluated, ``cluster_metrics_df`` is returned unchanged
        (which may be None).

    Raises:
        ValueError: If `filtered` carries no case ids or lacks `cluster_col`.
    """
    method_name = method_name or cluster_col

    # ----------------------------------------------------------
    # Validate index structure (cases must be index or a column)
    # ----------------------------------------------------------
    if filtered.index.name != "case:concept:name":
        if "case:concept:name" not in filtered.columns:
            raise ValueError("`filtered` must have case ids on the index or a 'case:concept:name' column.")
        filtered = filtered.set_index("case:concept:name", drop=True)

    if cluster_col not in filtered.columns:
        raise ValueError(f"`{cluster_col}` not found in filtered columns: {list(filtered.columns)}")

    # Rows are collected here and concatenated once after the loop instead of
    # re-copying the growing table on every iteration.
    new_rows = []

    # ----------------------------------------------------------
    # Iterate over clusters
    # ----------------------------------------------------------
    for c, case_ids in filtered.groupby(cluster_col).groups.items():
        if skip_noise and c == noise_label:
            continue

        # Extract cluster-specific subset of df
        cluster_df = df[df["case:concept:name"].isin(case_ids)].copy()
        if cluster_df.empty:
            continue

        # Standard timestamp normalization for PM4Py, then chronological order per case
        cluster_df = dataframe_utils.convert_timestamp_columns_in_df(cluster_df)
        cluster_df = cluster_df.sort_values(
            ["case:concept:name", "time:timestamp"],
            ignore_index=True,
        )

        # Same converter parameters as evaluate_global_model. The plain string
        # keys used previously ("case_id", ...) are not the converter's
        # parameter keys, so the conversion only worked through its defaults.
        params = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: "case:concept:name"}
        log = log_converter.apply(cluster_df, variant=log_converter.Variants.TO_EVENT_LOG, parameters=params)

        # ------------------------------------------------------
        # Discover model (IMf variant for detailed structure)
        # ------------------------------------------------------
        net, im, fm, tree = discover_model_for_miner_imf(log)

        eval_log = log  # no sampling: the cluster is scored on all of its traces

        # ------------------------------------------------------
        # Conformance metrics
        # ------------------------------------------------------
        fit_res = fitness_evaluator.apply(eval_log, net, im, fm, variant=FITNESS_VARIANT)
        # A missing value becomes NaN instead of None so that the float()
        # conversion and the formatted print below cannot fail.
        fitness = fit_res.get("log_fitness", np.nan)

        precision = precision_evaluator.apply(eval_log, net, im, fm, variant=PRECISION_VARIANT)
        if precision is None:
            precision = np.nan

        # Harmonic mean; 0.0 when undefined (a NaN operand makes the test False).
        if (precision + fitness) > 0:
            fscore = 2 * precision * fitness / (precision + fitness)
        else:
            fscore = 0.0

        variability = compute_variability_ratio(log)

        # ------------------------------------------------------
        # Record metrics row
        # ------------------------------------------------------
        new_rows.append({
            "Method":           method_name,
            "Cluster":          f"Cluster {c}",
            "NumTraces":        len(log),
            "Precision":        float(precision),
            "Fitness":          float(fitness),
            "FScore":           float(fscore),
            "VariabilityRatio": float(variability),
            "Miner":            "inductive",
        })

        print(
            f"✅ {cluster_col} / Cluster {c} (IMf) → "
            f"Prec {precision:.3f} | Fit {fitness:.3f} | F1 {fscore:.3f} | Var {variability:.3f} | "
            f"Traces {len(log)}"
        )

        # ------------------------------------------------------
        # Optional visualization
        # ------------------------------------------------------
        if visualize:
            # The process tree is rendered as BPMN; pn_visualizer.apply(net, im, fm)
            # can be used instead to render the Petri net itself.
            print(f"Rendering BPMN for cluster {c}...")
            bpmn_graph = process_tree_converter.apply(
                tree,
                variant=process_tree_converter.Variants.TO_BPMN,
            )
            gviz = bpmn_visualizer.apply(bpmn_graph)
            bpmn_visualizer.view(gviz)

    # ----------------------------------------------------------
    # Save to the cluster-level dataframe
    # ----------------------------------------------------------
    if not new_rows:
        return cluster_metrics_df

    new_metrics = pd.DataFrame(new_rows)
    if (cluster_metrics_df is None) or (len(cluster_metrics_df) == 0):
        return new_metrics

    return pd.concat([cluster_metrics_df, new_metrics], ignore_index=True)    # important for downstream analysis


### TFIDF Calls

In [None]:
# TF-IDF encodings: KMeans runs first and starts a fresh metrics table,
# then SOM and HDBSCAN append their rows to it.
cluster_metrics_df = None   # ← starting from None resets the dataframe cluster_metrics_df
for _cluster_col, _method in (
    ("cluster_kmeans_tfidf_svd", "KMeans-TFIDF"),
    ("cluster_som_tfidf_svd", "SOM-TFIDF"),
    ("cluster_hdbscan_tfidf_svd", "HDBSCAN-TFIDF"),
):
    cluster_metrics_df = discover_and_evaluate_per_cluster(
        df=df,
        filtered=filtered,
        cluster_col=_cluster_col,
        method_name=_method,
        cluster_metrics_df=cluster_metrics_df,  # << append to the running table
        skip_noise=True,
        noise_label=-1,   # same as the function default
        visualize=True,
    )

display(cluster_metrics_df)


### Doc2Vec Calls

In [None]:
# Doc2Vec encodings: KMeans, SOM and HDBSCAN rows are appended to the table
# that already holds the TF-IDF results (the table is not reset here).
for _cluster_col, _method in (
    ("cluster_kmeans_doc2vec", "KMeans-Doc2Vec"),
    ("cluster_som_doc2vec", "SOM-Doc2Vec"),
    ("cluster_hdbscan_doc2vec", "HDBSCAN-Doc2Vec"),
):
    cluster_metrics_df = discover_and_evaluate_per_cluster(
        df=df,
        filtered=filtered,
        cluster_col=_cluster_col,
        method_name=_method,
        cluster_metrics_df=cluster_metrics_df,   # append to same df
        skip_noise=True,
        noise_label=-1,   # same as the function default
        visualize=True,
    )


display(cluster_metrics_df)


In [None]:
# Persist the per-cluster metrics table as CSV (relative path, row index omitted).
cluster_metrics_df.to_csv("cluster_metrics_results.csv", index=False)


In [None]:
# Overall averages across every method / encoder in the metrics table.
all_models = cluster_metrics_df

# Each cluster row contributes in proportion to its number of traces.
weights = all_models["NumTraces"]

weighted_avg_all = (
    all_models[["Precision", "Fitness", "FScore"]]
    .mul(weights, axis="index")
    .sum()
    .div(weights.sum())
)

weighted_avg_all


In [None]:
# Use ALL methods (no filtering)
all_methods = cluster_metrics_df

# Trace-weighted averages per method (all encoders + models included).
# Computed with vectorized group sums instead of groupby.apply + lambda:
# numerator   = sum over the method's clusters of metric * NumTraces
# denominator = sum over the method's clusters of NumTraces
method_weighted_all = (
    all_methods[["Precision", "Fitness", "FScore"]]
    .mul(all_methods["NumTraces"], axis="index")
    .groupby(all_methods["Method"])
    .sum()
    .div(all_methods.groupby("Method")["NumTraces"].sum(), axis="index")
)

method_weighted_all


In [None]:
# Total number of traces evaluated per method: the NumTraces values of all
# cluster rows belonging to a method are summed and the result is named
# "TotalTraces".
trace_counts_per_method = (
    cluster_metrics_df
    .groupby("Method")["NumTraces"]
    .sum()
    .rename("TotalTraces")
)

trace_counts_per_method
