# Import Libraries & Setup

In [5]:
import os
import random
import math
import pandas as pd
import numpy as np
from datetime import datetime
import pickle
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers, callbacks
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization

# ML
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
    silhouette_score, davies_bouldin_score, calinski_harabasz_score,
    adjusted_rand_score
)
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.ensemble import RandomForestClassifier

# Viz
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# One seed for every RNG in play. Python's own `random` module is imported
# above but was never seeded, so it is seeded here alongside NumPy and TensorFlow.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print('Libraries loaded')
print(f'Time: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

Libraries loaded
Time: 2026-02-10 09:15:55


# Load Data

In [6]:
file = '../../data/road_dataset.csv'

try:
    df = pd.read_csv(file)
except FileNotFoundError:
    print(f"WARNING: '{file}' not found.")
    print("Please upload the correct dataset file to run with actual data.")
    # Every later cell needs `df`. Stop here instead of continuing with an
    # undefined frame, or a stale one left over from a previous kernel run.
    raise
else:
    print(f'Loaded: {df.shape[0]} shoes × {df.shape[1]} columns')
    display(df.head())

Loaded: 434 shoes × 45 columns


Unnamed: 0,brand,name,lightweight,rocker,orthotic_friendly,removable_insole,pace_daily_running,pace_tempo,pace_competition,arch_neutral,...,heel_stiff_flexible,heel_stiff_moderate,heel_stiff_stiff,plate_rock,plate_carbon,heel_lab_mm,forefoot_lab_mm,season_summer,season_winter,season_all
0,brooks,launch 9,1,0,1,1,1,1,0,1,...,1,0,0,0,0,32.4,23.0,0,0,0
1,brooks,levitate 6,0,0,1,1,1,0,0,1,...,0,1,0,0,0,34.3,26.6,1,0,1
2,adidas,4dfwd,0,0,1,1,1,0,0,1,...,1,0,0,0,0,33.3,24.4,0,0,1
3,adidas,4dfwd 2,0,0,1,1,1,0,0,1,...,0,1,0,0,0,31.8,21.2,0,0,1
4,adidas,4dfwd 3,0,0,1,1,1,0,0,1,...,1,0,0,0,0,32.6,22.7,0,0,1


In [7]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 45 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand                434 non-null    str    
 1   name                 434 non-null    str    
 2   lightweight          434 non-null    int64  
 3   rocker               434 non-null    int64  
 4   orthotic_friendly    434 non-null    int64  
 5   removable_insole     434 non-null    int64  
 6   pace_daily_running   434 non-null    int64  
 7   pace_tempo           434 non-null    int64  
 8   pace_competition     434 non-null    int64  
 9   arch_neutral         434 non-null    int64  
 10  arch_stability       434 non-null    int64  
 11  weight_lab_oz        434 non-null    float64
 12  drop_lab_mm          434 non-null    float64
 13  strike_heel          434 non-null    int64  
 14  strike_mid           434 non-null    int64  
 15  strike_forefoot      434 non-null    int64  
 16  s

# Preprocessing

In [8]:
# Every numeric column is a model feature. A column counts as binary when its
# observed values are a subset of {0, 1}; all other numeric columns are continuous.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

binary_cols = []
continuous_cols = []
for col in numeric_cols:
    is_flag = set(df[col].unique()) <= {0, 1}
    (binary_cols if is_flag else continuous_cols).append(col)

print(f'Features: {len(numeric_cols)} total')
print(f'  Binary     : {len(binary_cols)}')
print(f'  Continuous : {len(continuous_cols)}')

Features: 43 total
  Binary     : 35
  Continuous : 8


In [9]:
feature_cols = list(numeric_cols)
X = df[feature_cols]

# Binary flags already sit in [0, 1]; only the continuous block is rescaled.
X_binary = X[binary_cols].to_numpy()
X_continuous = X[continuous_cols].to_numpy()

# Min-max scale the continuous block so the whole network input lies in [0, 1]
scaler_continuous = MinMaxScaler()
X_continuous_scaled = scaler_continuous.fit_transform(X_continuous)

# Network input layout: [binary columns | scaled continuous columns]
X_combined = np.hstack([X_binary, X_continuous_scaled])

# Z-scored copy of all features, kept for comparison with traditional methods
scaler_standard = StandardScaler()
X_standard = scaler_standard.fit_transform(X)

print(f'Neural input shape: {X_combined.shape}')
print(f'Range: [{X_combined.min():.6f}, {X_combined.max():.6f}]')

Neural input shape: (434, 43)
Range: [0.000000, 1.000000]


# Auto-Encoder

## Modelling

In [10]:
# Architecture
input_dim = X_combined.shape[1]
encoding_dims = [32, 16, 8]


def _dense_block(tensor, units):
    """Apply Dense(relu) -> BatchNormalization -> Dropout(0.2) to `tensor`."""
    tensor = Dense(units, activation='relu')(tensor)
    tensor = BatchNormalization()(tensor)
    return Dropout(0.2)(tensor)


# Encoder: one block per entry of `encoding_dims`; the last block's output is the latent code
input_layer = Input(shape=(input_dim,))
x = input_layer
for dim in encoding_dims:
    x = _dense_block(x, dim)

latent = x

# Decoder: mirror the encoder widths, skipping the bottleneck itself
for dim in reversed(encoding_dims[:-1]):
    x = _dense_block(x, dim)

# Sigmoid output, so reconstructions live in (0, 1) like the scaled input
output_layer = Dense(input_dim, activation='sigmoid')(x)

autoencoder = Model(input_layer, output_layer)
encoder = Model(input_layer, latent)

autoencoder.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae'],
)

print('Autoencoder architecture:')
autoencoder.summary()

Autoencoder architecture:


## Training

In [11]:
# NOTE(review): per the Keras docs, `validation_split` takes the LAST fraction
# of the rows, before any shuffling. The head of the dataset looks grouped by
# brand, so the validation rows may not be a random sample — confirm.
history = autoencoder.fit(
    X_combined, X_combined,
    epochs=200,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
        callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-5)
    ],
    verbose=0
)

train_loss = history.history['loss']
val_loss = history.history['val_loss']
best_epoch = int(np.argmin(val_loss))

print('Training done!')
print(f'Final loss: {train_loss[-1]:.6f}')
print(f'  Val loss: {val_loss[-1]:.6f}')
# The two lines above describe the LAST epoch. EarlyStopping is configured with
# restore_best_weights=True, so the best epoch is reported as well.
print(f' Best val : {val_loss[best_epoch]:.6f} (epoch {best_epoch + 1} of {len(val_loss)})')

# Get latent representations
X_latent = encoder.predict(X_combined, verbose=0)
# Derive the dimensionality from the data instead of hard-coding "8D"
print(f'Latent space: {X_latent.shape} ({X_latent.shape[1]}D embeddings)')

Training done!
Final loss: 0.099724
  Val loss: 0.084650
Latent space: (434, 8) (8D embeddings)


# Metric Functions

In [12]:
def calculate_cluster_purity(df, cluster_col, binary_cols):
    """
    Measure how homogeneous each cluster is over a set of binary features.

    For each cluster and each binary column, the "dominance" is the share of
    members holding that column's most frequent value. A cluster's purity is
    the mean dominance over the evaluated columns.

    Args:
        df (pd.DataFrame): Data with one row per item, including cluster labels.
        cluster_col (str): Name of the column holding the cluster label.
        binary_cols (list): Candidate binary feature columns. Names that are not
            columns of `df` are skipped.

    Returns:
        dict: With keys
            - 'by_cluster': {cluster_id: {'purity': float, 'n': int}}; purity is
              0 when no column could be evaluated for that cluster.
            - 'mean_purity': Mean purity over clusters.
            - 'min_purity': Lowest cluster purity.
            - 'max_purity': Highest cluster purity.
    """
    # Every cluster slice has the same columns as `df`, so filter the names once.
    usable_cols = [name for name in binary_cols if name in df.columns]

    purity_by_cluster = {}
    for cluster_id in df[cluster_col].unique():
        members = df[df[cluster_col] == cluster_id]
        size = len(members)

        shares = []
        for name in usable_cols:
            counts = members[name].value_counts()
            if len(counts) == 0:
                # value_counts() is empty (e.g. a column with only missing values)
                continue
            shares.append(counts.max() / size)

        purity_by_cluster[cluster_id] = {
            'purity': np.mean(shares) if shares else 0,
            'n': size,
        }

    purities = [entry['purity'] for entry in purity_by_cluster.values()]
    return {
        'by_cluster': purity_by_cluster,
        'mean_purity': np.mean(purities),
        'min_purity': np.min(purities),
        'max_purity': np.max(purities),
    }

def calculate_cluster_stability(X, labels, model_func, n_iter=20):
    """
    Estimate clustering stability by bootstrap re-clustering.

    Each iteration draws `len(X)` row indices with replacement (from NumPy's
    global RNG), fits a fresh model on that resample, and compares the result
    with the original labels of the same rows using the Adjusted Rand Index.

    Args:
        X (np.ndarray): Feature matrix that was clustered; indexed as `X[idx]`.
        labels (np.ndarray): Reference cluster labels, aligned with the rows of X.
        model_func (callable): Zero-argument factory returning a new, unfitted
            model exposing `fit_predict` (e.g. `lambda: KMeans(n_clusters=k)`).
        n_iter (int, optional): Number of bootstrap rounds. Defaults to 20.

    Returns:
        dict: With keys
            - 'mean_ari': Mean ARI over the rounds.
            - 'std_ari': Standard deviation of the ARI values.
            - 'stability_level': 'Excellent' if the mean is > 0.8, 'Good' if it
              is > 0.6, otherwise 'Moderate'.
    """
    sample_size = len(X)

    def _bootstrap_ari():
        # Draw the resample first, then build the model (same order every round)
        picked = np.random.choice(sample_size, sample_size, replace=True)
        fresh_model = model_func()
        resampled_labels = fresh_model.fit_predict(X[picked])
        return adjusted_rand_score(labels[picked], resampled_labels)

    ari_scores = [_bootstrap_ari() for _ in range(n_iter)]
    mean_ari = np.mean(ari_scores)

    if mean_ari > 0.8:
        level = 'Excellent'
    elif mean_ari > 0.6:
        level = 'Good'
    else:
        level = 'Moderate'

    return {
        'mean_ari': mean_ari,
        'std_ari': np.std(ari_scores),
        'stability_level': level,
    }

def calculate_interpretability_score(df, cluster_col, binary_cols, threshold=0.75):
    """
    Score how sharply each cluster is characterised by binary features.

    A feature is "strong" for a cluster when the share of members with the
    feature set (column sum divided by cluster size) is above `threshold` or
    below `1 - threshold`. A cluster's score is the number of strong features
    divided by `len(binary_cols)`.

    Args:
        df (pd.DataFrame): Data with one row per item, including cluster labels.
        cluster_col (str): Name of the column holding the cluster label.
        binary_cols (list): Binary feature columns. Names missing from `df`
            never count as strong but still count in the denominator.
        threshold (float, optional): Cut-off for a strong feature. Defaults to 0.75.

    Returns:
        dict: With keys
            - 'mean_interpretability': Mean score over clusters.
            - 'scores': Per-cluster scores, in order of first appearance of
              each cluster label. Every score is 0.0 when `binary_cols` is
              empty (this used to raise ZeroDivisionError).
    """
    n_features = len(binary_cols)
    # Each cluster slice has the same columns as `df`, so filter the names once.
    usable_cols = [col for col in binary_cols if col in df.columns]

    scores = []
    for cid in df[cluster_col].unique():
        if n_features == 0:
            # No features to judge: score 0, consistent with
            # calculate_cluster_purity's behaviour for an empty column list.
            scores.append(0.0)
            continue

        cdata = df[df[cluster_col] == cid]
        n = len(cdata)

        strong = 0
        for col in usable_cols:
            share = cdata[col].sum() / n  # computed once per column
            if share > threshold or share < 1 - threshold:
                strong += 1

        # Proportion of binary features that are strong for this cluster
        scores.append(strong / n_features)

    return {'mean_interpretability': np.mean(scores), 'scores': scores}

def evaluate_clustering_comprehensive(X, labels, df_temp, model_func, binary_cols,
                                      n_stability_iter=10):
    """
    Evaluate one clustering with geometric and feature-based metrics.

    Computes Silhouette, Davies-Bouldin and Calinski-Harabasz on `X`, plus the
    custom purity, bootstrap stability and interpretability scores, then blends
    them into one composite score.

    Side effect: writes `labels` into `df_temp['cluster']`. Pass a copy if the
    caller's frame must stay untouched.

    Args:
        X (np.ndarray): Feature matrix that was clustered.
        labels (np.ndarray): Cluster label for each row of X.
        df_temp (pd.DataFrame): Frame with the binary feature columns, row-aligned
            with X. It is mutated (see above).
        model_func (callable): Zero-argument factory returning a new, unfitted
            clustering model (used for the stability bootstrap).
        binary_cols (list): Binary feature column names in `df_temp`.
        n_stability_iter (int, optional): Bootstrap rounds for the stability
            estimate. Defaults to 10, the value that used to be hard-coded.

    Returns:
        dict: With keys
            - 'silhouette': Silhouette Score.
            - 'davies_bouldin': Davies-Bouldin Score.
            - 'calinski_harabasz': Calinski-Harabasz Score.
            - 'purity': Mean cluster purity.
            - 'stability': Mean Adjusted Rand Index from the bootstrap.
            - 'interpretability': Mean cluster interpretability score.
            - 'composite_score': Weighted blend of the normalised metrics.
    """
    sil = silhouette_score(X, labels)
    db = davies_bouldin_score(X, labels)
    ch = calinski_harabasz_score(X, labels)

    df_temp['cluster'] = labels
    purity = calculate_cluster_purity(df_temp, 'cluster', binary_cols)
    stability = calculate_cluster_stability(X, labels, model_func, n_stability_iter)
    interp = calculate_interpretability_score(df_temp, 'cluster', binary_cols)

    # Bring the geometric scores into [0, 1] so the weights are comparable
    sil_norm = (sil + 1) / 2      # silhouette in [-1, 1] -> [0, 1]
    db_norm = 1 / (1 + db)        # lower Davies-Bouldin gives a higher value
    ch_norm = min(ch / 1000, 1)   # capped at 1 once CH reaches 1000

    # Composite score (weights sum to 1.0)
    composite = (0.25*sil_norm + 0.20*db_norm + 0.15*ch_norm +
                 0.25*purity['mean_purity'] + 0.10*stability['mean_ari'] +
                 0.05*interp['mean_interpretability'])

    return {
        'silhouette': sil, 'davies_bouldin': db, 'calinski_harabasz': ch,
        'purity': purity['mean_purity'], 'stability': stability['mean_ari'],
        'interpretability': interp['mean_interpretability'], 'composite_score': composite
    }

print('Metrics functions ready')

Metrics functions ready


# Model Selection

In [13]:
results = []

# Table header: (title, column width) pairs drive both the title row and the rule
table_layout = [('K', 3), ('Score', 8), ('Sil.', 8), ('DB', 8),
                ('CH', 10), ('Purity', 8), ('Stab.', 8), ('Interp', 8)]
print('| ' + ' | '.join(f'{title:^{width}}' for title, width in table_layout) + ' |')
print('|' + '+'.join('-' * (width + 2) for _, width in table_layout) + '|')

# Metric key and format spec for each value column, in display order
metric_formats = [
    ('composite_score', '<8.6f'), ('silhouette', '<6.6f'), ('davies_bouldin', '<6.6f'),
    ('calinski_harabasz', '<8.6f'), ('purity', '<6.6f'), ('stability', '<6.6f'),
    ('interpretability', '<6.6f'),
]

for i in range(3, 10):
    # Reset the global RNG so each K sees the same bootstrap resamples
    np.random.seed(42)

    model_factory = lambda: KMeans(n_clusters=i, random_state=42, n_init=10)
    model = model_factory()
    labels = model.fit_predict(X_latent)

    # df.copy(): the evaluator writes a 'cluster' column into the frame it is given
    metrics = evaluate_clustering_comprehensive(X_latent, labels, df.copy(), model_factory, binary_cols)

    # Store the fitted model and labels next to the metrics
    results.append(dict(k=i, model=model, labels=labels, **metrics))

    # Print one table row
    cells = [f'{i:^3}'] + [format(metrics[name], spec) for name, spec in metric_formats]
    print('| ' + ' | '.join(cells) + ' |')

|  K  |  Score   |   Sil.   |    DB    |     CH     |  Purity  |  Stab.   |  Interp  |
|-----+----------+----------+----------+------------+----------+----------+----------|
|  3  | 0.536636 | 0.225810 | 1.574971 | 131.965019 | 0.788665 | 0.602066 | 0.571429 |
|  4  | 0.580353 | 0.248853 | 1.415132 | 133.806262 | 0.815095 | 0.836621 | 0.678571 |
|  5  | 0.579434 | 0.257840 | 1.282359 | 130.563510 | 0.823408 | 0.725668 | 0.731429 |
|  6  | 0.591063 | 0.267566 | 1.217340 | 128.303697 | 0.829408 | 0.796312 | 0.723810 |
|  7  | 0.592214 | 0.269408 | 1.216540 | 125.323500 | 0.836194 | 0.793381 | 0.722449 |
|  8  | 0.585126 | 0.273552 | 1.181090 | 119.882867 | 0.837608 | 0.702429 | 0.732143 |
|  9  | 0.576803 | 0.272612 | 1.191398 | 117.088025 | 0.840528 | 0.628921 | 0.717460 |


In [14]:
df_results = pd.DataFrame(results)

# The configuration with the highest composite score wins
best_row_label = df_results['composite_score'].idxmax()
best_config = df_results.loc[best_row_label]

best_k, best_model, best_labels = (best_config[field] for field in ('k', 'model', 'labels'))
X_for_clustering = X_latent

print('\n'.join([
    f'SELECTED BEST K: {best_k}',
    f'   Silhouette      : {best_config["silhouette"]:.6f}',
    f'   Composite Score : {best_config["composite_score"]:.6f}',
]))

SELECTED BEST K: 7
   Silhouette      : 0.269408
   Composite Score : 0.592214


# Binning

In [15]:
# Tertile-bin every float64 column into a companion `<col>_bin` column
# (0 = lowest third, 2 = highest third).
for col in df.select_dtypes('float64').columns.tolist():
    df[f'{col}_bin'] = pd.qcut(df[col], q=3, labels=[0, 1, 2]).astype(int)

# Column order: non-numeric, binary, then each continuous column followed by
# its bin column (when one exists), and finally 'cluster'.
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

new_column_order = [col for col in non_numeric_cols + binary_cols if col in df.columns]
for col in continuous_cols:
    new_column_order.extend(name for name in (col, f'{col}_bin') if name in df.columns)

# Keep the cluster assignment at the very end if it has already been attached
if 'cluster' in df.columns and 'cluster' not in new_column_order:
    new_column_order.append('cluster')

# Re-select the columns in the new order
df = df[new_column_order]

In [16]:
# Re-check the schema now that the `_bin` columns exist and the columns are reordered.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 49 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand                434 non-null    str    
 1   name                 434 non-null    str    
 2   lightweight          434 non-null    int64  
 3   rocker               434 non-null    int64  
 4   orthotic_friendly    434 non-null    int64  
 5   removable_insole     434 non-null    int64  
 6   pace_daily_running   434 non-null    int64  
 7   pace_tempo           434 non-null    int64  
 8   pace_competition     434 non-null    int64  
 9   arch_neutral         434 non-null    int64  
 10  arch_stability       434 non-null    int64  
 11  strike_heel          434 non-null    int64  
 12  strike_mid           434 non-null    int64  
 13  strike_forefoot      434 non-null    int64  
 14  softness_soft        434 non-null    int64  
 15  softness_balanced    434 non-null    int64  
 16  s

# Generate Cluster Label

In [17]:
# Attach the selected cluster assignment to the DataFrame
df['cluster'] = best_labels

# Group binary columns by the text before their last underscore, e.g.
# 'arch_neutral' and 'arch_stability' both fall under 'arch'.
# NOTE(review): 'pace_daily_running' gets the prefix 'pace_daily', so it is not
# grouped with 'pace_tempo' / 'pace_competition' — confirm this is intended.
bin_groups = {}
for col in binary_cols:
    prefix = col.rsplit('_', 1)[0]
    bin_groups.setdefault(prefix, []).append(col)

# Build one summary row per cluster
cluster_ids = sorted(df['cluster'].unique())
rows = []
for cid in cluster_ids:
    subset = df[df['cluster'] == cid]
    n = len(subset)

    row = {'count': n, 'percentage': f"{n/len(df)*100:.1f}%"}

    # A. Continuous columns: report the cluster mean directly
    row.update({col.lower(): round(subset[col].mean(), 2) for col in continuous_cols})

    # B. Binary groups: report the member column with the highest mean
    for prefix, cols in bin_groups.items():
        means = subset[cols].mean()
        best_col, best_val = means.idxmax(), means.max()

        if len(cols) > 1:
            # Several variants: show the dominant variant's suffix
            header = prefix.lower()
            val_str = best_col.replace(f"{prefix}_", "").lower()
        else:
            # Standalone flag: show yes/no by majority
            header = cols[0].lower()
            val_str = "yes" if best_val > 0.5 else "no"

        row[header] = f"{val_str} ({best_val*100:.0f}%)"

    rows.append(row)

# Create the summary frame, indexed by cluster id with no index title
df_summary = pd.DataFrame(rows, index=cluster_ids)
df_summary.index.name = None

print("Cluster Summary:")
display(df_summary)

Cluster Summary:


Unnamed: 0,count,percentage,weight_lab_oz,drop_lab_mm,toebox_durability,heel_durability,outsole_durability,breathability,heel_lab_mm,forefoot_lab_mm,...,arch,strike,softness,width,toebox,stiffness,torsional,heel_stiff,plate,season
0,26,6.0%,9.71,9.89,0.0,0.0,0.0,0.0,33.38,23.49,...,neutral (62%),heel (73%),firm (4%),medium (65%),narrow (0%),stiff (85%),stiff (19%),stiff (15%),rock (0%),summer (0%)
1,57,13.1%,8.09,6.14,2.74,2.96,3.04,3.75,28.85,22.73,...,neutral (95%),mid (96%),soft (56%),medium (60%),medium (47%),moderate (65%),flexible (56%),flexible (60%),carbon (2%),all (98%)
2,60,13.8%,7.82,8.47,2.0,2.67,2.42,3.97,37.64,29.17,...,neutral (100%),mid (80%),soft (58%),narrow (65%),medium (43%),stiff (90%),stiff (95%),flexible (85%),carbon (87%),all (93%)
3,61,14.1%,10.12,9.48,3.18,3.38,3.79,3.18,37.1,27.63,...,stability (54%),heel (84%),soft (49%),medium (87%),medium (66%),moderate (85%),stiff (90%),stiff (87%),carbon (2%),all (93%)
4,72,16.6%,9.67,8.23,0.08,0.11,0.0,2.0,32.45,24.22,...,neutral (93%),mid (78%),balanced (38%),narrow (60%),narrow (8%),stiff (69%),moderate (50%),flexible (35%),rock (0%),all (58%)
5,89,20.5%,9.96,10.13,2.78,3.13,3.12,3.17,34.4,24.27,...,neutral (93%),heel (87%),balanced (81%),medium (61%),medium (63%),moderate (48%),moderate (53%),moderate (42%),carbon (2%),all (94%)
6,69,15.9%,9.87,7.8,3.06,3.52,3.78,3.29,37.99,30.2,...,neutral (90%),mid (88%),soft (72%),medium (80%),medium (86%),stiff (70%),stiff (78%),moderate (77%),carbon (6%),all (100%)


# Deep Learning Recommender

In [18]:
def get_priority_val(user_input, priority_list, mapping_dicts):
    """
    Resolve a feature value from the highest-priority answer that maps to one.

    Sources are tried in the order given by `priority_list`. A source is used
    only if the user gave a truthy answer for it, `mapping_dicts` has a table
    for it, and that table contains the answer. The first source that meets all
    three conditions decides the value; otherwise the next source is tried.

    Returns 0.5 (neutral) when no source yields a value.
    """
    for source in priority_list:
        if source not in user_input:
            continue
        answer = user_input[source]
        if not answer:
            continue
        if source not in mapping_dicts:
            continue
        table = mapping_dicts[source]
        if answer in table:
            return table[answer]
    return 0.5  # Neutral default

def preprocess_user_input_with_mask(user_input, binary_cols, continuous_cols):
    """
    Convert questionnaire answers into a full feature vector plus a relevance mask.

    Each answer is mapped to a value in {0, 0.5, 1} on the dataset's feature
    axes. Features whose driving answers are missing (or not recognised) get the
    neutral value 0.5, so an unanswered question does not pull the vector toward
    either extreme. This now also holds for the `pace_*` features when
    'running_purpose' is absent (previously treated as 'Daily') and for the
    `season_*` features when 'season' is absent (previously all zeros).

    Args:
        user_input (dict): Answers keyed by question name ('pace',
            'running_purpose', 'orthotic_usage', 'arch_type', 'strike_pattern',
            'cushion_preferences', 'stability_need', 'foot_width', 'season').
            'season' is a collection of season names; all others are single values.
        binary_cols (list): Binary feature names, in model input order.
        continuous_cols (list): Continuous feature names, in model input order
            (they follow the binary block).

    Returns:
        tuple:
            - full_vector_raw (list[float]): One value per column of
              `binary_cols + continuous_cols`. Columns this function has no
              mapping for default to 0.0 (binary) or 0.5 (continuous).
            - valid_indices (list[int]): Positions in that vector whose feature
              is driven by at least one question the user answered; intended
              for similarity scoring. Falls back to every position when
              nothing was answered.
    """
    # Accept any sequence (list, tuple, pd.Index) without `+` surprises
    binary_cols = list(binary_cols)
    continuous_cols = list(continuous_cols)

    # Filled only with features this function knows how to derive. Columns it
    # does not know pick up their defaults when the vector is assembled below.
    feats = {}

    # MAPPINGS (normalised to 0, 0.5, 1)
    map_pace_light = {'Easy': 0.5, 'Steady': 0.5, 'Fast': 1.0}
    map_pace_drop  = {'Easy': 1.0, 'Steady': 0.5, 'Fast': 0.0} # Easy = high drop

    # 1. SIMPLE FEATURES (single source)
    feats['lightweight'] = get_priority_val(user_input, ['pace'], {'pace': map_pace_light})
    feats['rocker'] = get_priority_val(user_input, ['running_purpose'], {'running_purpose': {'Race': 1.0, 'Tempo': 0.5, 'Daily': 0.0}})

    orth_val = get_priority_val(user_input, ['orthotic_usage'], {'orthotic_usage': {'Yes': 1.0, 'No': 0.5}})
    feats['orthotic_friendly'] = orth_val
    feats['removable_insole'] = orth_val

    # Same values as before for Daily / Tempo / Race, but a missing or
    # unrecognised purpose now resolves to the neutral 0.5, like `rocker` above.
    feats['pace_daily_running'] = get_priority_val(user_input, ['running_purpose'], {'running_purpose': {'Daily': 1.0, 'Tempo': 0.5, 'Race': 0.0}})
    feats['pace_tempo']         = get_priority_val(user_input, ['running_purpose'], {'running_purpose': {'Daily': 0.5, 'Tempo': 1.0, 'Race': 0.5}})
    feats['pace_competition']   = get_priority_val(user_input, ['running_purpose'], {'running_purpose': {'Daily': 0.0, 'Tempo': 0.5, 'Race': 1.0}})

    feats['arch_neutral']   = get_priority_val(user_input, ['arch_type'], {'arch_type': {'Low': 0.0, 'Neutral': 0.8, 'High': 1.0}})
    feats['arch_stability'] = get_priority_val(user_input, ['arch_type'], {'arch_type': {'Low': 1.0, 'Neutral': 0.2, 'High': 0.0}})

    feats['drop_lab_mm'] = get_priority_val(user_input, ['pace'], {'pace': map_pace_drop})

    # 2. PRIORITY-OVERRIDE FEATURES (first answered source wins)
    # Strike pattern > pace
    prio_strike = ['strike_pattern', 'pace']
    feats['strike_heel'] = get_priority_val(user_input, prio_strike, {'strike_pattern': {'Heel': 1.0, 'Mid': 0.5, 'Forefoot': 0.0}, 'pace': {'Easy': 1.0, 'Steady': 0.5, 'Fast': 0.0}})
    feats['strike_mid'] = get_priority_val(user_input, prio_strike, {'strike_pattern': {'Heel': 0.5, 'Mid': 1.0, 'Forefoot': 0.5}, 'pace': {'Easy': 0.5, 'Steady': 1.0, 'Fast': 0.5}})
    # NOTE(review): 'Mid' maps to 0.0 here but to 0.5 for strike_heel — confirm the asymmetry is intended.
    feats['strike_forefoot'] = get_priority_val(user_input, prio_strike, {'strike_pattern': {'Heel': 0.0, 'Mid': 0.0, 'Forefoot': 1.0}, 'pace': {'Easy': 0.0, 'Steady': 0.5, 'Fast': 1.0}})

    # Cushion > pace
    prio_soft = ['cushion_preferences', 'pace']
    feats['softness_soft'] = get_priority_val(user_input, prio_soft, {'cushion_preferences': {'Soft': 1.0, 'Balanced': 0.5, 'Firm': 0.0}, 'pace': {'Easy': 1.0, 'Steady': 0.5, 'Fast': 0.0}})
    feats['softness_balanced'] = get_priority_val(user_input, prio_soft, {'cushion_preferences': {'Soft': 0.5, 'Balanced': 1.0, 'Firm': 0.5}, 'pace': {'Easy': 0.5, 'Steady': 1.0, 'Fast': 0.0}})
    feats['softness_firm'] = get_priority_val(user_input, prio_soft, {'cushion_preferences': {'Soft': 0.0, 'Balanced': 0.5, 'Firm': 1.0}, 'pace': {'Easy': 0.0, 'Steady': 0.5, 'Fast': 1.0}})

    # Stability need > foot width
    prio_width = ['stability_need', 'foot_width']
    feats['width_narrow'] = get_priority_val(user_input, prio_width, {'stability_need': {'Neutral': 0.5, 'Guided': 1.0}, 'foot_width': {'Narrow': 1.0, 'Medium': 0.0, 'Wide': 0.0}})
    feats['width_medium'] = get_priority_val(user_input, prio_width, {'stability_need': {'Neutral': 0.5, 'Guided': 0.5}, 'foot_width': {'Narrow': 0.5, 'Medium': 1.0, 'Wide': 0.0}})
    feats['width_wide'] = get_priority_val(user_input, prio_width, {'stability_need': {'Neutral': 0.5, 'Guided': 0.0}, 'foot_width': {'Narrow': 0.0, 'Medium': 0.5, 'Wide': 1.0}})

    # Toebox (stability need only)
    feats['toebox_narrow'] = get_priority_val(user_input, ['stability_need'], {'stability_need': {'Neutral': 0.5, 'Guided': 1.0}})
    feats['toebox_medium'] = get_priority_val(user_input, ['stability_need'], {'stability_need': {'Neutral': 0.5, 'Guided': 0.5}})
    feats['toebox_wide']   = get_priority_val(user_input, ['stability_need'], {'stability_need': {'Neutral': 0.5, 'Guided': 0.0}})

    # Stiffness: arch > pace > purpose
    prio_stiff = ['arch_type', 'pace', 'running_purpose']
    feats['stiffness_flexible'] = get_priority_val(user_input, prio_stiff, {'arch_type': {'Low': 0.0, 'Neutral': 0.5, 'High': 0.5}, 'pace': {'Easy': 1.0, 'Steady': 0.5, 'Fast': 0.0}, 'running_purpose': {'Daily': 1.0, 'Tempo': 0.5, 'Race': 0.0}})
    feats['stiffness_moderate'] = get_priority_val(user_input, prio_stiff, {'arch_type': {'Low': 1.0, 'Neutral': 0.5, 'High': 0.5}, 'pace': {'Easy': 0.5, 'Steady': 1.0, 'Fast': 0.5}, 'running_purpose': {'Daily': 0.5, 'Tempo': 1.0, 'Race': 0.5}})
    feats['stiffness_stiff'] = get_priority_val(user_input, prio_stiff, {'arch_type': {'Low': 1.0, 'Neutral': 0.5, 'High': 0.5}, 'pace': {'Easy': 0.0, 'Steady': 0.5, 'Fast': 1.0}, 'running_purpose': {'Daily': 0.0, 'Tempo': 0.5, 'Race': 1.0}})

    # Torsional rigidity: arch > pace
    prio_tor = ['arch_type', 'pace']
    feats['torsional_flexible'] = get_priority_val(user_input, prio_tor, {'arch_type': {'Low': 0.0, 'Neutral': 0.5, 'High': 0.5}, 'pace': {'Easy': 1.0, 'Steady': 0.5, 'Fast': 0.0}})
    feats['torsional_moderate'] = get_priority_val(user_input, prio_tor, {'arch_type': {'Low': 0.5, 'Neutral': 0.5, 'High': 0.5}, 'pace': {'Easy': 0.5, 'Steady': 1.0, 'Fast': 0.5}})
    feats['torsional_stiff'] = get_priority_val(user_input, prio_tor, {'arch_type': {'Low': 1.0, 'Neutral': 0.5, 'High': 0.5}, 'pace': {'Easy': 0.0, 'Steady': 0.5, 'Fast': 1.0}})

    # Heel counter stiffness (arch only)
    feats['heel_stiff_flexible'] = get_priority_val(user_input, ['arch_type'], {'arch_type': {'Low': 0.0, 'Neutral': 0.5, 'High': 1.0}})
    feats['heel_stiff_moderate'] = get_priority_val(user_input, ['arch_type'], {'arch_type': {'Low': 0.5, 'Neutral': 1.0, 'High': 1.0}})
    feats['heel_stiff_stiff']    = get_priority_val(user_input, ['arch_type'], {'arch_type': {'Low': 1.0, 'Neutral': 0.5, 'High': 0.0}})

    # Plate: pace > purpose
    prio_plate = ['pace', 'running_purpose']
    feats['plate_rock'] = get_priority_val(user_input, prio_plate, {'pace': {'Easy': 0.5, 'Steady': 0.5, 'Fast': 0.5}, 'running_purpose': {'Daily': 0.5, 'Tempo': 0.5, 'Race': 0.5}})
    feats['plate_carbon'] = get_priority_val(user_input, prio_plate, {'pace': {'Easy': 0.5, 'Steady': 0.5, 'Fast': 1.0}, 'running_purpose': {'Daily': 0.5, 'Tempo': 0.5, 'Race': 1.0}})

    # Stack height: strike > pace > purpose
    prio_stack = ['strike_pattern', 'pace', 'running_purpose']
    feats['heel_lab_mm'] = get_priority_val(user_input, prio_stack, {'strike_pattern': {'Heel': 1.0, 'Mid': 0.5, 'Forefoot': 0.0}, 'pace': {'Easy': 1.0, 'Steady': 0.5, 'Fast': 0.0}, 'running_purpose': {'Daily': 1.0, 'Tempo': 0.5, 'Race': 0.5}})
    feats['forefoot_lab_mm'] = get_priority_val(user_input, prio_stack, {'strike_pattern': {'Heel': 0.0, 'Mid': 0.5, 'Forefoot': 1.0}, 'pace': {'Easy': 0.0, 'Steady': 0.5, 'Fast': 1.0}, 'running_purpose': {'Daily': 1.0, 'Tempo': 0.5, 'Race': 0.5}})

    # Weight (inverse of lightweight) and fixed defaults
    feats['weight_lab_oz'] = 1.0 - feats['lightweight']
    feats['toebox_durability'] = 1.0
    feats['heel_durability'] = 1.0
    feats['outsole_durability'] = 1.0
    feats['breathability'] = 1.0

    # Seasons: `or []` also covers an explicit None
    seasons = user_input.get('season') or []
    has_summer = 1.0 if 'Summer' in seasons else 0.0
    has_winter = 1.0 if 'Winter' in seasons else 0.0
    has_all    = 1.0 if 'All' in seasons else 0.0
    if has_summer or has_winter or has_all:
        feats['season_summer'] = max(has_summer*1.0, has_winter*0.0, has_all*0.5)
        feats['season_winter'] = max(has_summer*0.0, has_winter*1.0, has_all*0.5)
        feats['season_all']    = max(has_summer*0.5, has_winter*0.5, has_all*1.0)
    else:
        # No recognised season given: stay neutral instead of claiming
        # "no season at all" with three zeros.
        feats['season_summer'] = 0.5
        feats['season_winter'] = 0.5
        feats['season_all']    = 0.5

    # 3. MASKING LOGIC (keep only features backed by an answered question)
    provided_inputs = {k for k, v in user_input.items() if v}

    # Dependency map: which questions drive which feature
    feature_sources = {
        'lightweight': ['pace'], 'rocker': ['running_purpose'],
        'orthotic_friendly': ['orthotic_usage'], 'removable_insole': ['orthotic_usage'],
        'pace_daily_running': ['running_purpose'], 'pace_tempo': ['running_purpose'], 'pace_competition': ['running_purpose'],
        'arch_neutral': ['arch_type'], 'arch_stability': ['arch_type'],
        'drop_lab_mm': ['pace'],
        'strike_heel': ['strike_pattern', 'pace'], 'strike_mid': ['strike_pattern', 'pace'], 'strike_forefoot': ['strike_pattern', 'pace'],
        'softness_soft': ['cushion_preferences', 'pace'], 'softness_balanced': ['cushion_preferences', 'pace'], 'softness_firm': ['cushion_preferences', 'pace'],
        'width_narrow': ['stability_need', 'foot_width'], 'width_medium': ['stability_need', 'foot_width'], 'width_wide': ['stability_need', 'foot_width'],
        'toebox_narrow': ['stability_need'], 'toebox_medium': ['stability_need'], 'toebox_wide': ['stability_need'],
        'stiffness_flexible': ['arch_type', 'pace', 'running_purpose'], 'stiffness_moderate': ['arch_type', 'pace', 'running_purpose'], 'stiffness_stiff': ['arch_type', 'pace', 'running_purpose'],
        'torsional_flexible': ['arch_type', 'pace'], 'torsional_moderate': ['arch_type', 'pace'], 'torsional_stiff': ['arch_type', 'pace'],
        'heel_stiff_flexible': ['arch_type'], 'heel_stiff_moderate': ['arch_type'], 'heel_stiff_stiff': ['arch_type'],
        'plate_rock': ['pace', 'running_purpose'], 'plate_carbon': ['pace', 'running_purpose'],
        'heel_lab_mm': ['strike_pattern', 'pace', 'running_purpose'], 'forefoot_lab_mm': ['strike_pattern', 'pace', 'running_purpose'],
        'weight_lab_oz': ['pace'],
        'season_summer': ['season'], 'season_winter': ['season'], 'season_all': ['season'],
        # Default features: no source, so they are only scored in the no-input fallback
        'toebox_durability': [], 'heel_durability': [], 'outsole_durability': [], 'breathability': []
    }

    # Assemble the full vector in model input order: binary block, then continuous
    all_cols = binary_cols + continuous_cols
    full_vector_raw = (
        [feats.get(col, 0.0) for col in binary_cols]
        + [feats.get(col, 0.5) for col in continuous_cols]
    )

    # Positions whose feature is driven by at least one answered question
    valid_indices = [
        i for i, col in enumerate(all_cols)
        if any(src in provided_inputs for src in feature_sources.get(col, []))
    ]

    # Nothing answered at all: fall back to using every feature
    if not valid_indices:
        valid_indices = list(range(len(all_cols)))

    return full_vector_raw, valid_indices

# --- RECOMMENDER MAIN FUNCTION ---
def recommend_shoes_deep_masked(user_input, df_data, encoder_model, kmeans_model, binary_cols, continuous_cols, X_combined_data):
    """
    Recommend up to 10 shoes for a (possibly partial) user questionnaire.

    Pipeline:
      1. ``preprocess_user_input_with_mask`` turns ``user_input`` into a full
         feature vector plus the positions of the features to score on.
      2. The full vector is encoded with ``encoder_model`` and the closest
         third of the KMeans clusters (rounded up) is selected using the
         distances returned by ``kmeans_model.transform``.
      3. Rows of ``df_data`` whose ``'cluster'`` value is among those clusters
         become the candidates.
      4. Each candidate is scored with cosine similarity computed only over
         the masked feature positions.

    Parameters
    ----------
    user_input : dict
        Raw questionnaire answers, passed through unchanged to
        ``preprocess_user_input_with_mask``.
    df_data : pandas.DataFrame
        Shoe table; must contain a ``'cluster'`` column.
    encoder_model : object
        Model exposing ``predict(x, verbose=0)``.
    kmeans_model : object
        Fitted clustering model exposing ``transform`` and ``n_clusters``.
    binary_cols, continuous_cols : list
        Feature column names, in the order used to build the feature vector.
    X_combined_data : numpy.ndarray
        Feature matrix. Assumed to be 2-D with row ``i`` describing row ``i``
        of ``df_data`` (positional alignment) -- TODO confirm against the cell
        that builds it.

    Returns
    -------
    pandas.DataFrame
        At most 10 candidates sorted by ``match_score`` (descending), limited
        to whichever of ``brand``, ``name``, ``match_score``, ``cluster`` and
        ``price`` exist. An empty DataFrame if no row falls in the selected
        clusters.
    """
    # 1. Preprocess the answers and get the positions of the relevant features.
    full_vector, valid_idx = preprocess_user_input_with_mask(user_input, binary_cols, continuous_cols)
    full_vector = np.array([full_vector])

    # 2. Cluster lookup uses the complete vector (unanswered features keep
    #    the defaults chosen by the preprocessing step).
    user_latent = encoder_model.predict(full_vector, verbose=0)
    distances = kmeans_model.transform(user_latent)[0]
    n_select = math.ceil(kmeans_model.n_clusters / 3)
    closest_clusters = np.argsort(distances)[:n_select]

    print(f"User mapped to Clusters: {closest_clusters}")

    # 3. Select candidates by ROW POSITION rather than by index label, so the
    #    DataFrame rows and the feature-matrix rows stay aligned even when
    #    df_data does not carry a default 0..n-1 index.
    in_selected_cluster = df_data['cluster'].isin(closest_clusters).to_numpy()
    row_positions = np.flatnonzero(in_selected_cluster)
    if row_positions.size == 0:
        return pd.DataFrame()
    candidates = df_data.iloc[row_positions].copy()

    # 4. Masked scoring: compare only on the features the user answered.
    candidate_vectors = X_combined_data[row_positions]
    user_vec_masked = full_vector[:, valid_idx]
    cand_vecs_masked = candidate_vectors[:, valid_idx]

    if np.all(user_vec_masked == 0):
        # An all-zero user vector has no direction; score every candidate 0.
        scores = np.zeros(len(candidates))
    else:
        scores = cosine_similarity(user_vec_masked, cand_vecs_masked)[0]

    # 5. Assemble the result, showing only the columns that actually exist.
    candidates['match_score'] = scores
    cols_show = ['brand', 'name', 'match_score', 'cluster', 'price']
    cols_show = [c for c in cols_show if c in candidates.columns]

    return candidates.sort_values('match_score', ascending=False).head(10)[cols_show]

# Testing

In [19]:
# --- 1. INPUT OPTIONS SETUP ---
# Allowed answers for every questionnaire field.
# Every field maps to a list of single string answers, except 'season',
# whose answers are themselves lists (one or two season labels).
input_options = {
    'running_purpose': ['Daily', 'Tempo', 'Race'],
    'pace': ['Easy', 'Steady', 'Fast'],
    'orthotic_usage': ['Yes', 'No'],
    'arch_type': ['Low', 'Neutral', 'High'],
    'strike_pattern': ['Heel', 'Mid', 'Forefoot'],
    'cushion_preferences': ['Soft', 'Balanced', 'Firm'],
    'foot_width': ['Narrow', 'Medium', 'Wide'],
    'stability_need': ['Neutral', 'Guided'],
    'season': [['Summer'], ['Winter'], ['All'], ['Summer', 'All'], ['Winter', 'All']]
}

def generate_random_user_input(num_features, options=None, rng=None):
    """
    Build a random user-input dictionary that answers ``num_features`` fields.

    Parameters
    ----------
    num_features : int
        How many distinct fields to answer. Values larger than the number of
        available fields are capped; negative values are treated as 0.
    options : dict, optional
        Mapping of field name -> list of allowed answers. Defaults to the
        module-level ``input_options``.
    rng : object, optional
        Random source exposing ``sample(population, k)`` and ``choice(seq)``,
        e.g. ``random.Random(42)`` for reproducible test inputs. Defaults to
        the global ``random`` module.

    Returns
    -------
    dict
        ``{field: answer}`` for the randomly selected fields.
    """
    if options is None:
        options = input_options
    if rng is None:
        rng = random

    all_keys = list(options)

    # Clamp to [0, len(all_keys)]: sample() raises ValueError outside that range.
    n_pick = max(0, min(num_features, len(all_keys)))
    selected_keys = rng.sample(all_keys, k=n_pick)

    # One randomly chosen answer per selected field.
    return {key: rng.choice(options[key]) for key in selected_keys}

# --- 2. GENERATE & RUN TEST CASES ---
# Scenarios by number of answered fields: 3, 6 and 9 (all fields).
# NOTE(review): inputs are drawn with the stdlib `random` module; unless it is
# seeded in an earlier cell, the generated inputs differ on every run.
target_counts = [3, 6, 9]

print("=== MULAI PENGUJIAN RANDOM ===")

for i, count in enumerate(target_counts):
    print(f"\n{'-'*60}")
    print(f"TEST CASE #{i+1}: User mengisi {count} fitur")
    
    # 1. Generate the input
    random_input = generate_random_user_input(count)
    print(f"Input User:\n{random_input}")
    
    # 2. Run the recommender
    # Requires df, encoder, best_model, binary_cols, continuous_cols and
    # X_combined to exist already from earlier cells.
    try:
        recommendations = recommend_shoes_deep_masked(
            random_input, 
            df, 
            encoder, 
            best_model, 
            binary_cols, 
            continuous_cols, 
            X_combined
        )
        
        # 3. Show the result
        if not recommendations.empty:
            print("\nTop 10 Rekomendasi:")
            # Show only the relevant columns to keep the table tidy
            cols = ['brand', 'name', 'match_score', 'cluster']
            display(recommendations.head(10)[cols])
        else:
            print("\nTidak ada rekomendasi ditemukan (Cluster mungkin kosong).")
            
    # NOTE(review): this also catches a NameError raised *inside* the
    # recommender, which would then be reported as a missing-setup problem.
    except NameError:
        print("\nERROR: Pastikan kode setup model dan fungsi 'recommend_shoes_deep_masked' sudah dijalankan sebelumnya.")
    # Any other failure is reduced to its message; the traceback is not shown.
    except Exception as e:
        print(f"\nERROR: {e}")

=== MULAI PENGUJIAN RANDOM ===

------------------------------------------------------------
TEST CASE #1: User mengisi 3 fitur
Input User:
{'strike_pattern': 'Mid', 'running_purpose': 'Tempo', 'season': ['Summer']}
User mapped to Clusters: [1 2 5]

Top 10 Rekomendasi:


Unnamed: 0,brand,name,match_score,cluster
373,saucony,tempus,0.813979,5
101,saucony,endorphin speed 4,0.79552,1
288,asics,noosa tri 15,0.795089,1
6,adidas,adizero sl2,0.79506,1
23,adidas,adizero sl2,0.79506,1
229,brooks,hyperion 2,0.792007,1
148,new balance,fuelcell rebel v2,0.784478,2
95,asics,magic speed 4,0.779141,2
351,hoka,skyward x,0.779133,2
146,new balance,fuelcell propel v5,0.776251,1



------------------------------------------------------------
TEST CASE #2: User mengisi 6 fitur
Input User:
{'arch_type': 'Neutral', 'strike_pattern': 'Forefoot', 'orthotic_usage': 'Yes', 'season': ['Winter'], 'foot_width': 'Wide', 'stability_need': 'Guided'}
User mapped to Clusters: [5 1 6]

Top 10 Rekomendasi:


Unnamed: 0,brand,name,match_score,cluster
399,asics,versablast 4,0.766933,5
305,new balance,propel v4,0.761979,6
246,hoka,kawana 2,0.760841,5
427,reebok,zig dynamica 5,0.759725,5
207,skechers,go run ride 11,0.759447,6
88,asics,dynablast 4,0.758825,5
260,brooks,levitate stealthfit 6,0.757352,6
354,hoka,solimar,0.757336,1
144,new balance,fresh foam x tempo v2,0.755313,1
149,new balance,fuelcell rebel v3,0.735875,1



------------------------------------------------------------
TEST CASE #3: User mengisi 9 fitur
Input User:
{'strike_pattern': 'Forefoot', 'arch_type': 'Neutral', 'running_purpose': 'Race', 'orthotic_usage': 'No', 'foot_width': 'Wide', 'stability_need': 'Guided', 'season': ['All'], 'pace': 'Easy', 'cushion_preferences': 'Balanced'}
User mapped to Clusters: [2 6 1]

Top 10 Rekomendasi:


Unnamed: 0,brand,name,match_score,cluster
279,asics,metaspeed sky paris,0.759011,2
280,asics,metaspeed sky tokyo,0.756011,2
339,hoka,rocket x 3,0.746647,2
39,nike,alphafly 2,0.742399,2
20,adidas,adizero prime x 2 strung,0.738477,2
101,saucony,endorphin speed 4,0.735896,1
277,asics,metaspeed ray,0.729334,2
64,on,cloudboom echo 3,0.728822,2
275,asics,metaspeed edge tokyo,0.728247,2
361,asics,superblast,0.727155,2


# Save Model

In [20]:
# 1. Build a unique folder name from the current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
save_dir = f"../../model_artifacts/road/v_{timestamp}"

os.makedirs(save_dir, exist_ok=True)
print(f"Saving models to: {save_dir}")

# 2. Save every artifact under a STANDARD file name inside that folder.
# Note: the file names carry NO date; only the folder name does.
encoder.save(f'{save_dir}/shoe_encoder.keras')

# Clustering model used by the recommender (passed as `kmeans_model`).
with open(f'{save_dir}/kmeans_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Shoe table as currently held in `df`.
df.to_pickle(f'{save_dir}/shoe_metadata.pkl')

# Feature matrix used for similarity scoring.
with open(f'{save_dir}/shoe_features.pkl', 'wb') as f:
    pickle.dump(X_combined, f)

Saving models to: ../../model_artifacts/road/v_20260210_091709
