# Import Library & Setup

In [64]:
import pandas as pd
import numpy as np
from datetime import datetime
import pickle
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers, callbacks
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization

# ML
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
    silhouette_score, davies_bouldin_score, calinski_harabasz_score,
    adjusted_rand_score
)
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.ensemble import RandomForestClassifier

# Viz
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

np.random.seed(42)
tf.random.set_seed(42)

print('Libraries loaded')
print(f'Time: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

Libraries loaded
Time: 2026-02-05 11:23:45


# Load Data

In [65]:
file = '../data/road_dataset.csv'

try:
    df = pd.read_csv(file)
    print(f'Loaded: {df.shape[0]} shoes × {df.shape[1]} columns')
    display(df.head())
except FileNotFoundError:
    print(f"WARNING: '{file}' not found.")
    print("Please upload the correct dataset file to run with actual data.")

Loaded: 430 shoes × 49 columns


Unnamed: 0,brand,name,rocker,orthotic_friendly,removable_insole,pace_competition,pace_daily_running,pace_tempo,arch_neutral,arch_stability,...,torsion_stiff,heelcounter_flexible,heelcounter_moderate,heelcounter_stiff,season_all,season_summer,season_winter,weight_lab_oz,drop_lab_mm,heel_lab_mm
0,Brooks,Launch 9,0,1,1,0,1,1,1,0,...,1,1,0,0,0,0,0,7.9,9.4,32.4
1,Brooks,Levitate 6,0,1,1,0,1,0,1,0,...,0,0,1,0,1,1,0,10.7,7.7,34.3
2,Adidas,4DFWD,0,1,1,0,1,0,1,0,...,0,1,0,0,1,0,0,11.9,8.9,33.3
3,Adidas,4DFWD 2,0,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,12.6,10.6,31.8
4,Adidas,4DFWD 3,0,1,1,0,1,0,1,0,...,0,1,0,0,1,0,0,12.3,9.9,32.6


In [66]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 430 entries, 0 to 429
Data columns (total 49 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   brand                 430 non-null    str    
 1   name                  430 non-null    str    
 2   rocker                430 non-null    int64  
 3   orthotic_friendly     430 non-null    int64  
 4   removable_insole      430 non-null    int64  
 5   pace_competition      430 non-null    int64  
 6   pace_daily_running    430 non-null    int64  
 7   pace_tempo            430 non-null    int64  
 8   arch_neutral          430 non-null    int64  
 9   arch_stability        430 non-null    int64  
 10  strike_forefoot       430 non-null    int64  
 11  strike_heel           430 non-null    int64  
 12  strike_mid            430 non-null    int64  
 13  midsole_soft          430 non-null    int64  
 14  midsole_balanced      430 non-null    int64  
 15  midsole_firm          430 non-null

# Preprocessing

In [67]:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

binary_cols = [col for col in numeric_cols if set(df[col].unique()).issubset({0, 1})]
continuous_cols = [col for col in numeric_cols if col not in binary_cols]

print(f'Features: {len(numeric_cols)} total')
print(f'  Binary     : {len(binary_cols)}')
print(f'  Continuous : {len(continuous_cols)}')

Features: 47 total
  Binary     : 44
  Continuous : 3


In [68]:
feature_cols = numeric_cols.copy()
X = df[feature_cols]

# Separate for proper scaling
X_binary = X[binary_cols].values
X_continuous = X[continuous_cols].values

# Scale continuous to 0-1 for neural network
scaler_continuous = MinMaxScaler()
X_continuous_scaled = scaler_continuous.fit_transform(X_continuous)

# Combine
X_combined = np.concatenate([X_binary, X_continuous_scaled], axis=1)

# Also standard scaling for traditional comparison
scaler_standard = StandardScaler()
X_standard = scaler_standard.fit_transform(X)

print(f'Neural input shape: {X_combined.shape}')
print(f'Range: [{X_combined.min():.6f}, {X_combined.max():.6f}]')

Neural input shape: (430, 47)
Range: [0.000000, 1.000000]


# Auto-Encoder

## Modelling

In [69]:
# Architecture
input_dim = X_combined.shape[1]
encoding_dims = [32, 16, 8]

# Encoder
input_layer = Input(shape=(input_dim,))
x = input_layer
for dim in encoding_dims:
    x = Dense(dim, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)

latent = x

# Decoder
for dim in reversed(encoding_dims[:-1]):
    x = Dense(dim, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)

output_layer = Dense(input_dim, activation='sigmoid')(x)

autoencoder = Model(input_layer, output_layer)
encoder = Model(input_layer, latent)

autoencoder.compile(
    optimizer=optimizers.Adam(0.001),
    loss='mse',
    metrics=['mae']
)

print('Autoencoder architecture:')
autoencoder.summary()

Autoencoder architecture:


## Training

In [70]:
history = autoencoder.fit(
    X_combined, X_combined,
    epochs=200,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
        callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-5)
    ],
    verbose=0
)

print(f'Training done!')
print(f'Final loss: {history.history["loss"][-1]:.6f}')
print(f'  Val loss: {history.history["val_loss"][-1]:.6f}')

# Get latent representations
X_latent = encoder.predict(X_combined, verbose=0)
print(f'Latent space: {X_latent.shape} (8D embeddings)')

Training done!
Final loss: 0.115574
  Val loss: 0.113185
Latent space: (430, 8) (8D embeddings)


# Metrics Function

In [71]:
def calculate_cluster_purity(df, cluster_col, binary_cols):
    purity_by_cluster = {}
    for cid in df[cluster_col].unique():
        cdata = df[df[cluster_col] == cid]
        n = len(cdata)
        dominances = []
        for col in binary_cols:
            if col in cdata.columns:
                vc = cdata[col].value_counts()
                if len(vc) > 0:
                    dominances.append(vc.max() / n)
        purity_by_cluster[cid] = {'purity': np.mean(dominances) if dominances else 0, 'n': n}
    all_p = [v['purity'] for v in purity_by_cluster.values()]
    return {
        'by_cluster': purity_by_cluster,
        'mean_purity': np.mean(all_p),
        'min_purity': np.min(all_p),
        'max_purity': np.max(all_p)
    }

def calculate_cluster_stability(X, labels, model_func, n_iter=20):
    n = len(X)
    ari_scores = []
    for _ in range(n_iter):
        idx = np.random.choice(n, n, replace=True)
        boot_labels = model_func().fit_predict(X[idx])
        ari = adjusted_rand_score(labels[idx], boot_labels)
        ari_scores.append(ari)
    m = np.mean(ari_scores)
    return {
        'mean_ari': m,
        'std_ari': np.std(ari_scores),
        'stability_level': 'Excellent' if m > 0.8 else 'Good' if m > 0.6 else 'Moderate'
    }

def calculate_interpretability_score(df, cluster_col, binary_cols, threshold=0.75):
    scores = []
    for cid in df[cluster_col].unique():
        cdata = df[df[cluster_col] == cid]
        n = len(cdata)
        strong = sum(1 for col in binary_cols if col in cdata.columns and
                    (cdata[col].sum()/n > threshold or cdata[col].sum()/n < 1-threshold))
        scores.append(strong / len(binary_cols))
    return {'mean_interpretability': np.mean(scores), 'scores': scores}

def evaluate_clustering_comprehensive(X, labels, df_temp, model_func, binary_cols):
    sil = silhouette_score(X, labels)
    db = davies_bouldin_score(X, labels)
    ch = calinski_harabasz_score(X, labels)
    df_temp['cluster'] = labels
    purity = calculate_cluster_purity(df_temp, 'cluster', binary_cols)
    stability = calculate_cluster_stability(X, labels, model_func, 10)
    interp = calculate_interpretability_score(df_temp, 'cluster', binary_cols)

    sil_norm = (sil + 1) / 2
    db_norm = 1 / (1 + db)
    ch_norm = min(ch / 1000, 1)
    composite = (0.25*sil_norm + 0.20*db_norm + 0.15*ch_norm +
                 0.25*purity['mean_purity'] + 0.10*stability['mean_ari'] +
                 0.05*interp['mean_interpretability'])

    return {
        'silhouette': sil, 'davies_bouldin': db, 'calinski_harabasz': ch,
        'purity': purity['mean_purity'], 'stability': stability['mean_ari'],
        'interpretability': interp['mean_interpretability'], 'composite_score': composite
    }

print('Metrics functions ready')

Metrics functions ready


# Model Selection

In [72]:
import pandas as pd
from sklearn.cluster import KMeans

results = []

print('Running Deep Learning Clustering...')
for i in range(3,9):
    np.random.seed(42)
    
    model = KMeans(n_clusters=i, random_state=42, n_init=10)
    labels = model.fit_predict(X_latent)

    # Hitung Metrics (Panggil fungsi evaluasi kamu)
    metrics = evaluate_clustering_comprehensive(
        X_latent, labels, df.copy(),
        lambda: KMeans(n_clusters=i, random_state=42, n_init=10),
        binary_cols
    )

    # Simpan hasil
    results.append({
        'k': i,
        'model': model,
        'labels': labels,
        **metrics
    })
    print(f"  k={i}: Score={metrics['composite_score']:.6f}, Sil={metrics['silhouette']:.3f}")

# 2. Pilih yang Terbaik
df_results = pd.DataFrame(results)
best_config = df_results.loc[df_results['composite_score'].idxmax()]

# 3. Set Variabel Final
best_model = best_config['model']
best_labels = best_config['labels']
best_k = best_config['k']
X_for_clustering = X_latent

print(f'\nSELECTED BEST K: {best_k}')
print(f'   Score: {best_config["composite_score"]:.6f}')
print(f'   Silhouette: {best_config["silhouette"]:.6f}')

Running Deep Learning Clustering...
  k=3: Score=0.545825, Sil=0.245
  k=4: Score=0.549522, Sil=0.259
  k=5: Score=0.582773, Sil=0.264
  k=6: Score=0.570883, Sil=0.279
  k=7: Score=0.604331, Sil=0.291
  k=8: Score=0.593969, Sil=0.287

SELECTED BEST K: 7
   Score: 0.604331
   Silhouette: 0.291315


# Generate Cluster Label

In [74]:
df['cluster'] = best_labels

bin_groups = {}
for col in binary_cols:
    prefix = col.split('_')[0]
    bin_groups.setdefault(prefix, []).append(col)

rows = []
for cid in sorted(df['cluster'].unique()):
    subset = df[df['cluster'] == cid]
    n = len(subset)
    
    row = {'count': n, 'pct': f"{n/len(df)*100:.1f}%"}

    # A. Continuous Columns: Langsung ambil mean
    for col in continuous_cols:
        row[col.lower()] = round(subset[col].mean(), 2)

    # B. Binary Groups: Cari fitur paling dominan
    for prefix, cols in bin_groups.items():
        # Hitung rata-rata sekaligus untuk satu grup
        means = subset[cols].mean()
        best_col = means.idxmax()
        best_val = means.max()
        
        header = prefix.lower()
        
        # Logika Tampilan
        if len(cols) == 1:
            # Jika grup cuma 1 kolom (Yes/No), misal: rocker
            val_str = "yes" if best_val > 0.5 else "no"
        else:
            # Jika varian (hapus prefix), misal: season_all -> all
            val_str = best_col.replace(f"{prefix}_", "").lower()
            
        row[header] = f"{val_str} ({best_val*100:.0f}%)"

    rows.append(row)

# Create DataFrame & Fix Display
df_summary = pd.DataFrame(rows, index=sorted(df['cluster'].unique()))

print("Cluster Summary:")
display(df_summary)

Cluster Summary:


Unnamed: 0,count,pct,weight_lab_oz,drop_lab_mm,heel_lab_mm,rocker,orthotic,removable,pace,arch,...,toebox,heelpad,outsole,breath,width,toeboxwidth,stiff,torsion,heelcounter,season
0,41,9.5%,9.99,10.38,36.84,no (17%),yes (98%),yes (98%),daily_running (95%),stability (59%),...,decent (63%),good (68%),good (90%),moderate (44%),medium (93%),medium (85%),moderate (93%),stiff (88%),stiff (83%),all (90%)
1,48,11.2%,7.66,8.38,36.75,yes (83%),yes (71%),yes (71%),competition (71%),neutral (96%),...,bad (40%),good (52%),good (40%),breathable (83%),narrow (71%),medium (42%),stiff (85%),stiff (96%),flexible (85%),all (94%)
2,88,20.5%,9.8,7.99,38.59,yes (58%),yes (99%),yes (99%),daily_running (81%),neutral (91%),...,decent (52%),good (57%),good (84%),moderate (81%),medium (80%),medium (72%),stiff (70%),stiff (93%),moderate (56%),all (99%)
3,39,9.1%,9.6,9.09,32.53,no (15%),no (46%),no (46%),daily_running (97%),neutral (72%),...,decent (3%),decent (3%),decent (3%),warm (5%),medium (82%),wide (3%),stiff (69%),moderate (26%),stiff (18%),all (5%)
4,84,19.5%,9.61,9.04,33.39,no (27%),yes (96%),yes (96%),daily_running (90%),neutral (94%),...,bad (14%),bad (11%),good (12%),moderate (49%),narrow (82%),narrow (15%),stiff (74%),moderate (48%),moderate (37%),all (74%)
5,60,14.0%,8.72,6.44,29.26,no (12%),yes (100%),yes (100%),daily_running (80%),neutral (92%),...,bad (40%),good (40%),good (57%),breathable (55%),medium (65%),medium (45%),moderate (43%),flexible (67%),flexible (65%),all (98%)
6,70,16.3%,9.73,9.45,34.39,no (11%),yes (100%),yes (100%),daily_running (100%),neutral (89%),...,decent (63%),good (61%),good (49%),moderate (80%),medium (76%),medium (70%),moderate (74%),moderate (71%),moderate (49%),all (97%)
