# Path Config & Global Imports

In [None]:
# ======================================================
# 0) Environment Setup MUST come first
# ======================================================
# OMP_NUM_THREADS is exported before any numerical library is imported, so
# the value is already present in the environment when those libraries load.
# NOTE(review): presumably this pins OpenMP-backed code to a single thread;
# the reason (oversubscription, a KMeans warning, ...) is not recorded — confirm.
import os
os.environ["OMP_NUM_THREADS"] = "1"   # MUST be before numpy/sklearn imports


# ======================================================
# 1) Imports & Global Config
# ======================================================
# Standard library
import random
from pathlib import Path

# Numerical / Data handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from IPython.display import display

# ML / NLP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from minisom import MiniSom
from hdbscan import HDBSCAN

# Reproducibility
# Seed both the stdlib generator and NumPy's global generator. Libraries that
# fall back to the global NumPy RNG (e.g. scikit-learn estimators created with
# random_state=None) only become repeatable once the latter is seeded too.
GLOBAL_SEED = 42
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
# NOTE: PYTHONHASHSEED is read once at interpreter start-up. Assigning it here
# does not change hash randomisation in the running kernel; it only affects
# child processes that inherit this environment.
os.environ["PYTHONHASHSEED"] = str(GLOBAL_SEED)

# Project path
# The notebook's working directory is used as the root for input files.
BASE_PATH = Path.cwd()
print("Working in:", BASE_PATH)

# Warnings cleanup
# Only FutureWarning is silenced; every other warning category stays visible.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load clustering utilities
import cluster_utils2 as cu
# Report the module that was actually imported (the previous message named
# "cluster_utils.py", which is not the module loaded above).
print(f"Imported {cu.__name__}.py successfully.")


# Load Data and Create Trace Strings for Vectorization

In [None]:
# ======================================================
# Vectorization & Clustering Pipeline (Annotated Version)
# ======================================================
# This script loads event-log data, constructs textual variants of cases,
# encodes them using TF-IDF and Doc2Vec, reduces dimensionality when needed,
# and runs several clustering algorithms (KMeans, SOM, HDBSCAN). Metrics and
# cluster-level summaries are also computed.


# ======================================================
# 2) Load Event Log Data
# ======================================================

# The sampled event log is read from the project directory.
log_path = BASE_PATH.joinpath("df_sampled_100_cases.csv")

# Read the CSV, turn unparseable timestamps into NaT (errors="coerce"), drop
# the rows left without a timestamp, then order events by case and by time.
df = (
    pd.read_csv(log_path)
    .assign(**{
        "time:timestamp": lambda frame: pd.to_datetime(
            frame["time:timestamp"], errors="coerce"
        )
    })
    .dropna(subset=["time:timestamp"])
    .sort_values(["case:concept:name", "time:timestamp"])
    .copy()
)

print(f"Log loaded with {len(df)} events.")
# Last expression of the cell: preview of the first rows.
df.head()





## Extract Trace Sequences

In [None]:
# ======================================================
# 3) Extract trace sequences → FILTERED
# ======================================================

# Names of the case, activity and timestamp columns in the event log.
_trace_columns = dict(
    case_col="case:concept:name",
    activity_col="concept:name",
    timestamp_col="time:timestamp",
)

# NOTE(review): min_variant_freq=2 presumably discards variants seen fewer
# than twice — confirm against cluster_utils2.extract_trace_sequences.
filtered = cu.extract_trace_sequences(df, **_trace_columns, min_variant_freq=2)

print(f"Extracted {len(filtered)} trace sequences.")

# NOTE(review): PM4Py downstream is said to require case IDs as the index, but
# no index is set in this cell — confirm the helper already takes care of it.
filtered.head()



## Vectorize trace sequences

In [None]:
# ======================================================
# 4) Vectorization (TF-IDF + SVD + Doc2Vec)
# ======================================================

# `encoders` maps an encoding name to its matrix; the "tfidf_svd" and
# "doc2vec" entries are the ones read later in the notebook.
encoders, artifacts = cu.vectorize(filtered["trace_str"], method="both")

# Report the shape of every encoding that came back.
for name, _matrix in encoders.items():
    print(f"{name} {_matrix.shape}")

# The first clustering round works on the encoding stored under "tfidf_svd".
X = encoders["tfidf_svd"]
print(f"Using tfidf_svd: {X.shape}")



## TF-IDF Clustering

In [None]:
# ======================================================
# 5) KMeans Sweep → Best K
# ======================================================

# Candidate cluster counts 2..15; the same range is reused by the SOM sweep
# and by the Doc2Vec sweeps further down.
K_RANGE = range(2, 16)

# Two values come back: the selected configuration (its "k" entry is read by
# the final clustering cell) and the per-K results used for the trend plot.
best_km, km_results = cu.find_num_clusters(
    X,
    K_RANGE,
    cluster_algo="kmeans"
)

# Last expression of the cell: shows the selected configuration.
best_km



In [None]:
# Plot silhouette curve
# Input is the per-K result set returned by the KMeans sweep; the string is
# presumably used as the plot title.
cu.plot_silhouette_trend(km_results, "KMeans Silhouette Trend (TF-IDF)")


In [None]:
# Final KMeans clustering
# Cluster count selected by the KMeans sweep.
chosen_k = best_km["k"]

# Cluster the TF-IDF/SVD vectors; returns the per-trace labels and the model.
labels, model = cu.cluster_traces(
    X,
    num_clusters=chosen_k,
    cluster_algo="kmeans"
)

# Distinct label values that were assigned.
print("Assigned clusters:", set(labels))

# Cluster-level summary computed from the labels and the trace table.
cluster_stats = cu.mine_from_clusters(
    labels,
    num_clusters=chosen_k,
    sequences_df=filtered   # trace table produced by the extraction step
)

# Last expression of the cell: shows the summary.
cluster_stats



In [None]:
# ======================================================
# 6) SOM Sweep → Best K
# ======================================================

# Sweep the shared K range with the SOM algorithm and plot the trend.
best_som, som_results = cu.find_num_clusters(X, K_RANGE, cluster_algo="som")
cu.plot_silhouette_trend(som_results, "SOM Silhouette Trend (TF-IDF)")

# Read the selected cluster count once and reuse it for both calls below.
_som_k = best_som["k"]

# Only the labels are kept; the second return value is discarded.
labels_som, _ = cu.cluster_traces(X, num_clusters=_som_k, cluster_algo="som")

# Last expression of the cell, so the returned summary is shown as output.
cu.mine_from_clusters(labels_som, num_clusters=_som_k, sequences_df=filtered)



In [None]:
# ======================================================
# 7) HDBSCAN (no K needed)
# ======================================================

# No num_clusters argument is passed for HDBSCAN.
labels_hdbscan, hdb_model = cu.cluster_traces(
    X,
    cluster_algo="hdbscan"
)

# Cluster-level summary; num_clusters is None because no K was chosen.
cluster_stats_hdb = cu.mine_from_clusters(
    labels_hdbscan,
    num_clusters=None,
    sequences_df=filtered     # trace table produced by the extraction step
)

# Last expression of the cell: shows the summary.
cluster_stats_hdb



In [None]:
# ======================================================
# 8) Attach Cluster Assignments to FILTERED
# ======================================================

# One new column per algorithm, added in the order KMeans, SOM, HDBSCAN.
_tfidf_assignments = {
    "cluster_kmeans_tfidf_svd": labels,
    "cluster_som_tfidf_svd": labels_som,
    "cluster_hdbscan_tfidf_svd": labels_hdbscan,
}
for _column, _values in _tfidf_assignments.items():
    filtered[_column] = _values

# Last expression of the cell: preview including the new columns.
filtered.head()



# Doc2Vec Clustering

In [None]:
# ======================================================
# DOC2VEC CLUSTERING (KMeans, SOM, HDBSCAN)
# ======================================================

# Select Doc2Vec vectors
# Taken from the same `encoders` mapping that supplied the TF-IDF/SVD matrix.
X_doc2vec = encoders["doc2vec"]
print("Using Doc2Vec embedding:", X_doc2vec.shape)


In [None]:
# ======================================================
# 1) KMeans sweep
# ======================================================
# Sweep the shared K range on the Doc2Vec vectors.
best_km_doc, km_results_doc = cu.find_num_clusters(
    X_doc2vec,
    K_RANGE,
    cluster_algo="kmeans"
)

# Trend plot for the per-K results of this sweep.
cu.plot_silhouette_trend(
    km_results_doc,
    "KMeans Silhouette Trend (Doc2Vec)"
)

# Cluster count selected by the sweep.
chosen_k_doc = best_km_doc["k"]

# Final clustering with the selected K; returns the labels and the model.
labels_kmeans_doc2vec, model_kmeans_doc2vec = cu.cluster_traces(
    X_doc2vec,
    num_clusters=chosen_k_doc,
    cluster_algo="kmeans"
)

# Last expression of the cell, so the returned summary is shown as output.
cu.mine_from_clusters(
    labels_kmeans_doc2vec,
    num_clusters=chosen_k_doc,
    sequences_df=filtered       # trace table produced by the extraction step
)

In [None]:
# ======================================================
# 2) SOM sweep
# ======================================================
# Sweep the shared K range on the Doc2Vec vectors and plot the trend.
best_som_doc, som_results_doc = cu.find_num_clusters(
    X_doc2vec, K_RANGE, cluster_algo="som"
)
cu.plot_silhouette_trend(som_results_doc, "SOM Silhouette Trend (Doc2Vec)")

# Read the selected cluster count once and reuse it for both calls below.
_som_k_doc = best_som_doc["k"]

# Only the labels are kept; the second return value is discarded.
labels_som_doc2vec, _ = cu.cluster_traces(
    X_doc2vec, num_clusters=_som_k_doc, cluster_algo="som"
)

# Last expression of the cell, so the returned summary is shown as output.
cu.mine_from_clusters(
    labels_som_doc2vec, num_clusters=_som_k_doc, sequences_df=filtered
)


In [None]:
# ======================================================
# 3) HDBSCAN
# ======================================================
# No num_clusters argument is passed for HDBSCAN.
labels_hdbscan_doc2vec, hdb_model_doc2vec = cu.cluster_traces(
    X_doc2vec,
    cluster_algo="hdbscan"
)

# num_clusters=None is passed explicitly so this call has the same shape as
# the TF-IDF HDBSCAN call and does not depend on the helper's default value.
# Last expression of the cell, so the returned summary is shown as output.
cu.mine_from_clusters(
    labels_hdbscan_doc2vec,
    num_clusters=None,
    sequences_df=filtered
)


In [None]:
# ======================================================
# 4) Attach cluster labels
# ======================================================
# One new column per algorithm, added in the order KMeans, SOM, HDBSCAN.
_doc2vec_assignments = {
    "cluster_kmeans_doc2vec": labels_kmeans_doc2vec,
    "cluster_som_doc2vec": labels_som_doc2vec,
    "cluster_hdbscan_doc2vec": labels_hdbscan_doc2vec,
}
for _column, _values in _doc2vec_assignments.items():
    filtered[_column] = _values

# Last expression of the cell: preview including the new columns.
filtered.head()

# Process Discovery - Global

In [None]:
import cluster_utils2
#from importlib import reload
#reload(cluster_utils2)
from cluster_utils2 import evaluate_global_model


In [None]:
# Select the discovery algorithm by overwriting the module-level MINER_TYPE
# attribute of cluster_utils2.
# NOTE(review): presumably evaluate_global_model reads this attribute at call
# time — confirm in cluster_utils2.
cluster_utils2.MINER_TYPE = "inductive"     # or "alpha" or "heuristics"


### When MINER_TYPE = "heuristics" 

In [None]:
# Evaluate the global model with the heuristics miner, as this section's
# heading states. On a top-to-bottom run MINER_TYPE is still "inductive" when
# this cell executes, so the miner is selected here explicitly.
# NOTE(review): assumes evaluate_global_model reads cluster_utils2.MINER_TYPE
# at call time — confirm.
_previous_miner_type = cluster_utils2.MINER_TYPE
cluster_utils2.MINER_TYPE = "heuristics"
try:
    metrics_df = evaluate_global_model(df)
finally:
    # Put the earlier selection back so later cells are not affected.
    cluster_utils2.MINER_TYPE = _previous_miner_type
display(metrics_df)

### When MINER_TYPE = "inductive" 

In [None]:
# Evaluate the global model with the inductive miner, as this section's
# heading states. The miner is selected explicitly so the result does not
# depend on which MINER_TYPE an earlier cell happened to leave behind.
# NOTE(review): assumes evaluate_global_model reads cluster_utils2.MINER_TYPE
# at call time — confirm.
cluster_utils2.MINER_TYPE = "inductive"
# `metrics_df` is reused, so it replaces any result from a previous section.
metrics_df = evaluate_global_model(df)
display(metrics_df)

# Process Discovery - Per Cluster

## Inductive Miner

In [None]:
#import cluster_utils2
#from importlib import reload
#reload(cluster_utils2)

In [None]:
from cluster_utils2 import discover_and_evaluate_per_cluster

### TF-IDF Calls

In [None]:
# TF-IDF based clusterings, evaluated in the order KMeans, SOM, HDBSCAN.
# Each entry: (cluster column in `filtered`, method label, extra keyword
# arguments). Only the HDBSCAN run names its noise label.
_tfidf_runs = (
    ("cluster_kmeans_tfidf_svd", "KMeans-TFIDF", {}),
    ("cluster_som_tfidf_svd", "SOM-TFIDF", {}),
    ("cluster_hdbscan_tfidf_svd", "HDBSCAN-TFIDF", {"noise_label": -1}),
)

# The first call receives None; every later call is handed the table returned
# by the previous one, so the results accumulate in a single table.
_previous_metrics = None
for _cluster_col, _method_name, _extra_kwargs in _tfidf_runs:
    cluster_metrics_df = discover_and_evaluate_per_cluster(
        df=df,
        filtered=filtered,
        cluster_col=_cluster_col,
        method_name=_method_name,
        cluster_metrics_df=_previous_metrics,
        skip_noise=True,
        **_extra_kwargs,
        visualize=True,
        model_type="tree",
    )
    _previous_metrics = cluster_metrics_df

display(cluster_metrics_df)

### Doc2Vec Calls

In [None]:
# Doc2Vec based clusterings, evaluated in the order KMeans, SOM, HDBSCAN.
# Each entry: (cluster column in `filtered`, method label, extra keyword
# arguments). Only the HDBSCAN run names its noise label.
_doc2vec_runs = (
    ("cluster_kmeans_doc2vec", "KMeans-Doc2Vec", {}),
    ("cluster_som_doc2vec", "SOM-Doc2Vec", {}),
    ("cluster_hdbscan_doc2vec", "HDBSCAN-Doc2Vec", {"noise_label": -1}),
)

# Every call, including the first, is handed the existing `cluster_metrics_df`,
# so these results are added to the table built by the TF-IDF cell.
for _cluster_col, _method_name, _extra_kwargs in _doc2vec_runs:
    cluster_metrics_df = discover_and_evaluate_per_cluster(
        df=df,
        filtered=filtered,
        cluster_col=_cluster_col,
        method_name=_method_name,
        cluster_metrics_df=cluster_metrics_df,
        skip_noise=True,
        **_extra_kwargs,
        visualize=True,
        model_type="tree",
    )


display(cluster_metrics_df)


In [None]:
# Write the accumulated per-cluster metrics to a CSV file. The path is
# relative, so the file lands in the current working directory; the DataFrame
# index is not written (index=False).
cluster_metrics_df.to_csv("cluster_metrics_results.csv", index=False)
