In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
import pandas as pd
import plotly.express as px


## Clustering by physical attributes

In [None]:
# Group records by their physical attributes with KMeans, print the median
# total_time of every cluster, and plot the clusters on two PCA components.
phys_feat_df = pd.read_parquet("../data/processed/physical_feat.parquet")

# Columns used as clustering input.
phys_cols = [
    "ks_objem",
    "ks_hmotnost_brutto",
    "ks_sirka",
    "longest_side",
    "is_bulky",
    "is_heavy",
    "missing_carton_info",
    "missing_display_info",
]

# Feature matrix; missing entries are replaced by 0 before scaling.
X = phys_feat_df[phys_cols].fillna(0)

# Standardize every column so no single feature dominates the distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit KMeans and store the cluster label of each row on the source frame.
k = 4
km = KMeans(n_init=10, random_state=42, n_clusters=k)
clusters = km.fit_predict(X_scaled)
phys_feat_df["cluster_phys"] = clusters

# Median total_time per cluster, slowest cluster first.
median_time_by_cluster = phys_feat_df.groupby("cluster_phys")["total_time"].median()
cluster_stats = median_time_by_cluster.reset_index().sort_values(
    "total_time", ascending=False
)
print(cluster_stats)

# Project the scaled features onto two principal components for plotting only;
# the clustering itself was done in the full feature space.
pca = PCA(n_components=2)
coords = pca.fit_transform(X_scaled)

axis_labels = {"x": "PCA 1", "y": "PCA 2", "color": "Cluster"}
fig = px.scatter(
    x=coords[:, 0],
    y=coords[:, 1],
    color=clusters.astype(str),  # string labels -> discrete colours
    labels=axis_labels,
    title="KMeans Clusters of Physical Features",
)
fig.show()


In [None]:

# ----------------------- Elbow method to find optimal k ------------------------
# Fit KMeans for a range of cluster counts and plot the inertia of each fit.
#
# Every intermediate here has an elbow-specific name (and the loop variable is
# confined to a comprehension) so that running this cell does not overwrite
# `k`, `km`, `scaler` or `X_scaled` from the clustering cell.
#
# NOTE(review): this feature list differs from `phys_cols` used for the actual
# clustering (adds ks_delka, ks_vyska, density, aspect_ratio; omits the two
# missing_* flags) - confirm the elbow is meant to be computed on these columns.
X_phys = phys_feat_df[
    [
        "ks_objem", "ks_hmotnost_brutto", "ks_sirka", "ks_delka", "ks_vyska",
        "longest_side", "is_bulky", "is_heavy", "density", "aspect_ratio"
    ]
].fillna(0)

X_phys_scaled = StandardScaler().fit_transform(X_phys)

# Candidate cluster counts: 2..9 inclusive.
K = range(2, 10)

# inertia_ is the within-cluster sum of squared distances of a fitted model.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
    .fit(X_phys_scaled)
    .inertia_
    for n_clusters in K
]

elbow_df = pd.DataFrame({"k": list(K), "inertia": inertia})

fig = px.line(
    elbow_df,
    x="k",
    y="inertia",
    markers=True,
    title="Elbow Method for Optimal k",
    labels={"k": "Number of clusters (k)", "inertia": "Sum of squares"},
)
fig.show()


In [None]:
%reset

## Clustering by location/branch

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
import pandas as pd
import plotly.express as px

In [None]:
# Cluster records by their categorical location/branch attributes and compare
# the total_time distribution of the resulting clusters.
location_feat = pd.read_parquet("../data/processed/location_feat.parquet")

# ------------------------ Select the location features ------------------------
loc_feats = [
    "pracoviste_kod",
    "umisteni_new",
    "umisteni_vyska",
    "zcz_zpusob_zprac_kod",
    "trasa_kod",
]

# Every selected column is treated as a categorical variable.
Xloc = location_feat[loc_feats].astype("category")

# ------------------------ One-hot encode ---------------------------------------
# Dense output, because the encoded matrix is passed straight to KMeans.
enc = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
Xloc_ohe = enc.fit_transform(Xloc)

# ------------------------ Clustering --------------------------------------------
k = 3
km_loc = KMeans(n_init=10, random_state=42, n_clusters=k)
loc_clusters = km_loc.fit_predict(Xloc_ohe)
location_feat["cluster_location"] = loc_clusters

# ------------------------ Analyze clusters --------------------------------------
# Median total_time per cluster, slowest cluster first.
median_time_by_cluster = location_feat.groupby("cluster_location")["total_time"].median()
loc_stats = median_time_by_cluster.reset_index().sort_values(
    "total_time", ascending=False
)
print(loc_stats)

box_labels = {"total_time": "Picking time (minutes)", "cluster_location": "Cluster"}
fig = px.box(
    location_feat,
    x="cluster_location",
    y="total_time",
    color="cluster_location",
    points="outliers",  # draw only the outlying points individually
    labels=box_labels,
    title="Distribution of Picking Time by Location Cluster",
)

fig.show()


In [None]:
%reset

## Clustering by operational workload

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
import pandas as pd
import plotly.express as px

In [None]:
# Cluster records on workload features, print the median total_time of each
# cluster, and visualise the clusters as a box plot and a PCA scatter.
worlkload_feat_df = pd.read_parquet("../data/processed/workload_feat.parquet")

# ----------------------- Select useful workload features -----------------------
workload_feats = [
    "workload_proxy",
    "log_weight",
    "time_per_item",
    'vaha_vyriz',
]

# Missing values are filled with the median of their own column.
workload_subset = worlkload_feat_df[workload_feats]
Xwork = workload_subset.fillna(workload_subset.median())

# Standardize every column before computing distances.
scaler = StandardScaler()
Xwork_scaled = scaler.fit_transform(Xwork)

# ----------------------- KMeans clustering -----------------------
k = 3
km_work = KMeans(n_init=10, random_state=42, n_clusters=k)
work_clusters = km_work.fit_predict(Xwork_scaled)
worlkload_feat_df["cluster_workload"] = work_clusters

# ----------------------- Analyze median time per cluster -----------------------
# Slowest cluster first.
median_time_by_cluster = worlkload_feat_df.groupby("cluster_workload")["total_time"].median()
work_stats = median_time_by_cluster.reset_index().sort_values(
    "total_time", ascending=False
)
print(work_stats)

# ----------------------- Plot boxplot of picking time per cluster -----------------------
box_labels = {"total_time": "Picking time (minutes)", "cluster_workload": "Cluster"}
fig1 = px.box(
    worlkload_feat_df,
    x="cluster_workload",
    y="total_time",
    color="cluster_workload",
    points="outliers",
    labels=box_labels,
    title="Distribution of Picking Time by Workload Cluster",
)
fig1.show()

# ----------------------- PCA 2D scatter for visualization -----------------------
# The projection is for plotting only; clustering used all scaled features.
pca = PCA(n_components=2)
coords = pca.fit_transform(Xwork_scaled)

scatter_labels = {"x": "PCA 1", "y": "PCA 2", "color": "Cluster"}
fig2 = px.scatter(
    x=coords[:, 0],
    y=coords[:, 1],
    color=worlkload_feat_df["cluster_workload"].astype(str),  # discrete colours
    labels=scatter_labels,
    title="Workload Feature Clusters (PCA projection)",
)
fig2.show()

In [None]:
%reset

## Combined features: physical item attributes + workload features

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
import pandas as pd
import plotly.express as px

In [None]:
# Cluster records on physical and workload features together, joined on
# cinnost_id, then summarise and plot total_time per cluster.
phys_feat_df = pd.read_parquet("../data/processed/physical_feat.parquet")
worlkload_feat_df = pd.read_parquet("../data/processed/workload_feat.parquet")

phys_cols = [
    "ks_objem",
    "ks_hmotnost_brutto",
    "ks_sirka",
    "longest_side",
    "is_bulky",
    "is_heavy",
    "missing_carton_info",
    "missing_display_info",
]
workload_feats = [
    "workload_proxy",
    "log_weight",
    "time_per_item",
    'vaha_vyriz',
]

# Left join: every physical-feature row is kept; workload columns (and
# total_time) are NaN where no matching cinnost_id exists.
# NOTE(review): if cinnost_id is not unique in the workload frame this join
# duplicates rows - confirm the key is unique.
phys_part = phys_feat_df[phys_cols + ['cinnost_id']]
work_part = worlkload_feat_df[workload_feats + ['cinnost_id', 'total_time']]
combined_df = phys_part.merge(work_part, how="left", on="cinnost_id")

# ------------------- Select features -------------------
combined_feats = [*phys_cols, *workload_feats]

# Missing values in any feature become 0.
# NOTE(review): the workload-only section fills these columns with their
# median instead - confirm 0 is the intended fill here.
X = combined_df[combined_feats].fillna(0)

# ------------------- Standardize -------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ------------------- KMeans clustering -------------------
k = 3
km = KMeans(n_init=10, random_state=42, n_clusters=k)
combined_clusters = km.fit_predict(X_scaled)

combined_df["cluster_phys_work"] = combined_clusters

# ------------------- Cluster summary -------------------
# Median total_time per cluster, slowest cluster first.
median_time_by_cluster = combined_df.groupby("cluster_phys_work")["total_time"].median()
summary = median_time_by_cluster.reset_index().sort_values(
    "total_time", ascending=False
)
print(summary)

# ------------------- PCA 2D visualization -------------------
# The projection is for plotting only; clustering used all scaled features.
pca = PCA(n_components=2)
coords = pca.fit_transform(X_scaled)

scatter_labels = {"x": "PCA 1", "y": "PCA 2", "color": "Cluster"}
fig = px.scatter(
    x=coords[:, 0],
    y=coords[:, 1],
    color=combined_df["cluster_phys_work"].astype(str),  # discrete colours
    labels=scatter_labels,
    title="Combined Physical + Workload Feature Clusters (PCA projection)",
)
fig.show()

# ------------------- Boxplot of time per cluster -------------------
box_labels = {"total_time": "Picking time (minutes)", "cluster_phys_work": "Cluster"}
fig2 = px.box(
    combined_df,
    x="cluster_phys_work",
    y="total_time",
    color="cluster_phys_work",
    points="outliers",
    labels=box_labels,
    title="Distribution of Picking Time by Combined Clusters",
)
fig2.show()


In [None]:
%reset

## Clustering by item hierarchy categories

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import plotly.express as px

In [None]:
# Cluster product-hierarchy categories on (speed_median, count) and visualise
# them: a treemap coloured by median speed, a treemap coloured by cluster, and
# a scatter of count vs. speed_median coloured by cluster.

# Categories with `count` at or below this threshold are excluded.
MIN_OPERATION_COUNT = 100

# Hierarchy levels shared by both treemaps, outermost first.
HIERARCHY_PATH = [
    "hierarchie_lvl_1", "hierarchie_lvl_2", "hierarchie_lvl_3",
    "hierarchie_lvl_4", "hierarchie_lvl_5",
]

hierarchy_feats_df = pd.read_parquet("../data/processed/hierarchy_feat.parquet")
hierarchy_filtered = hierarchy_feats_df[
    hierarchy_feats_df["count"] > MIN_OPERATION_COUNT
].copy()

# Standardize both inputs so `count` does not dominate the distances.
Xh = hierarchy_filtered[["speed_median", "count"]]
scaler = StandardScaler()
Xh_scaled = scaler.fit_transform(Xh)

km = KMeans(n_clusters=5, random_state=42, n_init=10)
hierarchy_filtered["cluster_hierarchy"] = km.fit_predict(Xh_scaled)

# Cluster ids are labels, not magnitudes. Plotting the integer column would
# put them on a continuous colour scale, so the plots use a string copy of the
# label to get discrete colours. `hierarchy_filtered` itself is not modified.
hierarchy_plot_df = hierarchy_filtered.assign(
    cluster_label=hierarchy_filtered["cluster_hierarchy"].astype(str)
)

# --- Treemap 1: median speed per category ---
fig = px.treemap(
    hierarchy_filtered,
    path=HIERARCHY_PATH,
    values="count",  # box size = number of operations
    color="speed_median",  # box color = median picking time
    color_continuous_scale="RdYlGn_r",  # reversed scale: low = green, high = red
    title="Median Picking Time by Product Hierarchy"
)

fig.update_layout(
    margin=dict(t=40, l=0, r=0, b=0)
)
fig.show()

# --- Treemap 2: KMeans cluster per category ---
fig = px.treemap(
    hierarchy_plot_df,
    path=HIERARCHY_PATH,
    values="count",  # box size = number of operations
    color="cluster_label",  # box color = KMeans cluster (discrete)
    labels={"cluster_label": "Cluster"},
    title="KMeans Cluster by Product Hierarchy"
)

fig.update_layout(
    margin=dict(t=40, l=0, r=0, b=0)
)
fig.show()

# --- Scatter: frequency vs. speed, coloured by cluster ---
fig = px.scatter(
    hierarchy_plot_df,
    x="count",
    y="speed_median",
    color="cluster_label",
    size="count",
    hover_data=["hierarchie_lvl_1", "hierarchie_lvl_2", "hierarchie_lvl_3"],
    title="Hierarchy Categories by Speed & Frequency",
    labels={
        "count": "Operation count",
        "speed_median": "Median time (minutes)",
        "cluster_label": "Cluster",
    }
)
fig.update_xaxes(type="log")  # log scale helps if some counts are huge
fig.show()