# Import Library & Configuration

In [25]:
import os
os.environ['PYTHONHASHSEED'] = '0'
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISM'] = '1'

import random
random.seed(42)

import math
import pandas as pd
import numpy as np
np.random.seed(42)

from datetime import datetime
import pickle
import warnings
warnings.filterwarnings('ignore')

"""
Deep Learning Framework
TensorFlow & Keras: Autoencoder architecture with Dense, BatchNormalization, Dropout layers
"""
import tensorflow as tf
tf.random.set_seed(42)
from tensorflow.keras import layers, Model, optimizers, callbacks
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization

"""
Machine Learning & Metrics
- KMeans: K-means clustering for shoe recommendation groups
- StandardScaler/MinMaxScaler: Feature normalization for ML models
- Clustering Metrics: Silhouette, Davies-Bouldin, Calinski-Harabasz indices
- Similarity: Cosine similarity for recommendation ranking
"""
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
    silhouette_score, davies_bouldin_score, calinski_harabasz_score,
    adjusted_rand_score
)
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.ensemble import RandomForestClassifier

"""
Data Visualization
Matplotlib & Seaborn for statistical plots and cluster visualization
"""
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

np.random.seed(42)
tf.random.set_seed(42)

print('Libraries loaded')
print(f'Time: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

Libraries loaded
Time: 2026-02-14 11:47:50


# Load Data

In [26]:
file = '../../data/road_dataset.csv'

try:
    df = pd.read_csv(file)
    print(f'Loaded: {df.shape[0]} shoes × {df.shape[1]} columns')
    display(df.head())
except FileNotFoundError:
    print(f"WARNING: '{file}' not found.")
    print("Please upload the correct dataset file to run with actual data.")

Loaded: 428 shoes × 32 columns


Unnamed: 0,brand,name,lightweight,rocker,removable_insole,pace_daily_running,pace_tempo,pace_competition,arch_neutral,arch_stability,...,stiffness_scaled,torsional_rigidity,heel_stiff,plate_rock_plate,plate_carbon_plate,heel_lab_mm,forefoot_lab_mm,season_summer,season_winter,season_all
0,brooks,launch 9,1,0,1,1,1,0,1,0,...,5,5,1,0,0,32.4,23.0,0,0,0
1,brooks,levitate 6,0,0,1,1,0,0,1,0,...,5,3,3,0,0,34.3,26.6,1,0,1
2,adidas,4dfwd,0,0,1,1,0,0,1,0,...,5,1,1,0,0,33.3,24.4,0,0,1
3,adidas,4dfwd 2,0,0,1,1,0,0,1,0,...,5,1,3,0,0,31.8,21.2,0,0,1
4,adidas,4dfwd 3,0,0,1,1,0,0,1,0,...,3,1,1,0,0,32.6,22.7,0,0,1


In [27]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   brand                 428 non-null    str    
 1   name                  428 non-null    str    
 2   lightweight           428 non-null    int64  
 3   rocker                428 non-null    int64  
 4   removable_insole      428 non-null    int64  
 5   pace_daily_running    428 non-null    int64  
 6   pace_tempo            428 non-null    int64  
 7   pace_competition      428 non-null    int64  
 8   arch_neutral          428 non-null    int64  
 9   arch_stability        428 non-null    int64  
 10  weight_lab_oz         428 non-null    float64
 11  drop_lab_mm           428 non-null    float64
 12  strike_heel           428 non-null    int64  
 13  strike_mid            428 non-null    int64  
 14  strike_forefoot       428 non-null    int64  
 15  midsole_softness      428 non-null

# Preprocessing

## Feature Engineering
Separates numeric features into two categories for different preprocessing strategies.

In [28]:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

binary_cols = [col for col in numeric_cols if set(df[col].unique()).issubset({0, 1})]
continuous_cols = [col for col in numeric_cols if col not in binary_cols]

print(f'Features: {len(numeric_cols)} total')
print(f'  Binary     : {len(binary_cols)}')
print(f'  Continuous : {len(continuous_cols)}')

Features: 30 total
  Binary     : 16
  Continuous : 14


## Normalization
- Binary features: kept as-is (0-1 range)
- Continuous features: MinMaxScaler to [0, 1]
- Combined array: binary + continuous scaled features
This ensures neural network compatibility and distance metric compatibility.

In [29]:
feature_cols = numeric_cols.copy()
X = df[feature_cols]

X_binary = X[binary_cols].values
X_continuous = X[continuous_cols].values

scaler_continuous = MinMaxScaler()
X_continuous_scaled = scaler_continuous.fit_transform(X_continuous)

X_combined = np.concatenate([X_binary, X_continuous_scaled], axis=1)

scaler_standard = StandardScaler()
X_standard = scaler_standard.fit_transform(X)

print(f'Neural input shape: {X_combined.shape}')
print(f'Range: [{X_combined.min():.6f}, {X_combined.max():.6f}]')

Neural input shape: (428, 30)
Range: [0.000000, 1.000000]


# Auto-Encoder

## Modelling
- Purpose: Dimensionality reduction (high-D features → 8D latent space)
- Architecture: Encoder [input → 32 → 16 → 8] + Decoder [8 → 16 → 32 → reconstructed]
- Regularization: BatchNormalization + Dropout(0.2) at each dense layer
- Loss: MSE (reconstruction error) | Optimizer: Adam(lr=0.001)

In [30]:
input_dim = X_combined.shape[1]
encoding_dims = [32, 16, 8]

input_layer = Input(shape=(input_dim,))
x = input_layer
for dim in encoding_dims:
    x = Dense(dim, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)

latent = x

for dim in reversed(encoding_dims[:-1]):
    x = Dense(dim, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)

output_layer = Dense(input_dim, activation='sigmoid')(x)

autoencoder = Model(input_layer, output_layer)
encoder = Model(input_layer, latent)

autoencoder.compile(
    optimizer=optimizers.Adam(0.001),
    loss='mse',
    metrics=['mae']
)

print('Autoencoder architecture:')
autoencoder.summary()

Autoencoder architecture:


## Training
Training Configuration
- Epochs: 300 (with early stopping)
- Batch size: 64
- Validation split: 20%
- Early stopping: patience=20 (stop if val_loss doesn't improve)
- LR reduction: factor=0.5, patience=10, min_lr=1e-5
- Output: X_latent (8D embeddings) for KMeans clustering

In [31]:
history = autoencoder.fit(
    X_combined, X_combined,
    epochs=300,
    batch_size=64,
    validation_split=0.25,
    shuffle=False,
    callbacks=[
        callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
        callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-5)
    ],
    verbose=0
)

print(f'Training done!')
print(f'Final loss: {history.history["loss"][-1]:.6f}')
print(f'  Val loss: {history.history["val_loss"][-1]:.6f}')

X_latent = encoder.predict(X_combined, verbose=0)
print(f'Latent space: {X_latent.shape} (8D embeddings)')

Training done!
Final loss: 0.098345
  Val loss: 0.074054
Latent space: (428, 8) (8D embeddings)


# Metrics Function

## Interpretability Score

In [32]:
def calculate_interpretability_score(df, cluster_col, binary_cols, top_n=5, threshold=0.70):
    """
    Calculate cluster interpretability based on top feature clarity.
    
    Strategy: Focus on strongest features (not averages) to reduce noise.
    Strength Definition: Distance from neutral point (0.5)
      - Features closer to 0 or 1 = strong patterns
      - Features closer to 0.5 = ambiguous patterns
    
    Args:
        df (pd.DataFrame): Cluster-labeled dataset
        cluster_col (str): Column with cluster assignments
        binary_cols (list): Binary feature names
        top_n (int): Number of top features to evaluate (default: 5)
        threshold (float): Unused parameter (API compatibility)
    
    Returns:
        dict: {'mean_interpretability': float [0, 1]}
              1.0 = clear feature patterns, 0.0 = no patterns
    """
    scores = []
    unique_clusters = df[cluster_col].unique()
    
    for cid in unique_clusters:
        cdata = df[df[cluster_col] == cid]
        n = len(cdata)
        if n == 0: 
            scores.append(0)
            continue
        
        feature_strength = []
        for col in binary_cols:
            if col in cdata.columns:
                avg = cdata[col].mean()
                strength = abs(avg - 0.5) * 2  # Normalize distance from neutral (0.5) to [0, 1]
                feature_strength.append(strength)
        
        if feature_strength:
            feature_strength.sort(reverse=True)
            top_features = feature_strength[:top_n]
            scores.append(np.mean(top_features))  # Average of top-N features
        else:
            scores.append(0)
            
    return {'mean_interpretability': np.mean(scores) if scores else 0}

## Cluster Purity

In [33]:
def calculate_cluster_purity(df, cluster_col, binary_cols):
    """
    Measure internal cluster homogeneity via majority class dominance.
    
    Purity Calculation: For each feature, compute max(class0_pct, class1_pct)
    Average across all features = cluster purity
    Range: [0.5, 1.0] where 1.0 = perfect homogeneity
    
    Args:
        df (pd.DataFrame): Cluster-labeled dataset
        cluster_col (str): Column with cluster assignments
        binary_cols (list): Binary feature names
    
    Returns:
        dict: {'mean_purity': float [0.5, 1.0]}
    """
    purity_by_cluster = []
    unique_clusters = df[cluster_col].unique()
    
    for cid in unique_clusters:
        cdata = df[df[cluster_col] == cid]
        n = len(cdata)
        if n == 0: continue
        
        dominances = []
        for col in binary_cols:
            if col in cdata.columns:
                avg = cdata[col].mean()
                dominances.append(max(avg, 1 - avg))  # Majority class percentage
        
        if dominances:
            purity_by_cluster.append(np.mean(dominances))
    
    return {'mean_purity': np.mean(purity_by_cluster) if purity_by_cluster else 0}

## Cluster Stability

In [34]:
def calculate_cluster_stability(X, labels, model_func, n_iter=5, seed=42):
    """
    Bootstrap stability testing via Adjusted Rand Index (ARI).
    
    Process:
    1. Train model on bootstrap sample (with replacement)
    2. Compare original vs bootstrap clustering using ARI
    3. Average ARI across iterations
    
    ARI Range: [-1, 1]
      > 0.5: excellent stability
      0.2-0.5: fair stability
      < 0.2: poor stability
    
    Args:
        X (np.ndarray): Feature matrix
        labels (np.ndarray): Original cluster assignments
        model_func (callable): Returns instantiated clustering model
        n_iter (int): Bootstrap iterations (default: 5)
    
    Returns:
        dict: {'mean_ari': float [-1, 1]}
    """
    if len(np.unique(labels)) < 2:
        return {'mean_ari': 0}

    n = len(X)
    ari_scores = []

    for i in range(n_iter):
        rng = np.random.default_rng(seed=42 + i)
        idx = rng.choice(n, n, replace=True)
        try:
            boot_model = model_func()
            boot_labels = boot_model.fit_predict(X[idx])
            ari = adjusted_rand_score(labels[idx], boot_labels)
            ari_scores.append(ari)
        except Exception:
            continue

    m = np.mean(ari_scores) if ari_scores else 0
    return {'mean_ari': m}

## Comprehensive Evaluation

In [35]:
def evaluate_clustering_comprehensive(X, labels, df_original, model_func, binary_cols):
    """
    Multi-metric clustering evaluation combining geometric and business metrics.
    
    Evaluation Framework:
    
    1. GEOMETRIC METRICS (Scikit-learn):
       Silhouette [-1, 1]: cluster separation quality
       Davies-Bouldin [0, ∞): intra-cluster density (lower better)
       Calinski-Harabasz [0, ∞): cluster definition (higher better)
    
    2. BUSINESS METRICS:
       Purity: internal homogeneity
       Interpretability: feature pattern clarity
       Stability: clustering consistency
    
    3. COMPOSITE SCORING (strategic weights):
       Structure (40%): 40% Silhouette + 30% Davies-Bouldin + 30% Calinski-Harabasz
       Explainability (30%): 50% Interpretability + 50% Purity
       Reliability (30%): Bootstrap ARI stability
    
    Args:
        X (np.ndarray): Latent feature space (typically autoencoder output)
        labels (np.ndarray): Cluster assignments [0, K-1]
        df_original (pd.DataFrame): Original shoe metadata
        model_func (callable): KMeans factory function
        binary_cols (list): Binary feature column names
    
    Returns:
        dict: {
            'metrics': {silhouette, davies_bouldin, calinski_harabasz, purity, stability, interpretability},
            'composite_score': float [0, 1]
        }
    """
    df_eval = df_original.copy()
    df_eval['cluster'] = labels
    
    sil = silhouette_score(X, labels)
    db = davies_bouldin_score(X, labels)
    ch = calinski_harabasz_score(X, labels)
    
    purity_res = calculate_cluster_purity(df_eval, 'cluster', binary_cols)
    interp_res = calculate_interpretability_score(df_eval, 'cluster', binary_cols, top_n=5)
    stability_res = calculate_cluster_stability(X, labels, model_func, n_iter=3)
    
    val_purity = purity_res['mean_purity']
    val_interp = interp_res['mean_interpretability']
    val_stability = stability_res['mean_ari']

    sil_norm = (sil + 1) / 2  # Map Silhouette [-1, 1] → [0, 1]
    db_norm = np.exp(-0.5 * db)  # Exponential decay: DB lower is better
    
    if ch > 0:
        ch_log = np.log1p(ch)
        ch_norm = min(ch_log / 8, 1.0)  # Log scaling: assume max log(CH) ≈ 9.2
    else:
        ch_norm = 0

    score_structure = (0.4 * sil_norm) + (0.3 * db_norm) + (0.3 * ch_norm)  # Weight: Silhouette 40%, DB 30%, CH 30%
    score_explain = (0.5 * val_interp) + (0.5 * val_purity)  # Weight: Interpretability 50%, Purity 50%
    score_reliability = max(val_stability, 0)  # Clip negative ARI to 0

    composite = (0.40 * score_structure) + (0.30 * score_explain) + (0.30 * score_reliability)

    return {
        'metrics': {
            'silhouette': sil, 
            'davies_bouldin': db, 
            'calinski_harabasz': ch,
            'purity': val_purity, 
            'stability': val_stability,
            'interpretability': val_interp
        },
        'composite_score': composite
    }

print('Metrics Function Ready.')

Metrics Function Ready.


# Model Selection
Model Selection Pipeline: K-means Clustering (K=3 to K=9)

For each K value:
  - Train KMeans model
  - Evaluate using comprehensive metrics
  - Compute composite score

Select K with highest composite score (40% geometry, 30% explainability, 30% reliability)

In [36]:
results = []

print(f"| {'K':^3} | {'Score':^8} | {'Sil.':^8} | {'DB':^8} | {'CH':^10} | {'Purity':^8} | {'Stab.':^8} | {'Interp':^8} |")
print(f"|{'-'*5}+{'-'*10}+{'-'*10}+{'-'*10}+{'-'*12}+{'-'*10}+{'-'*10}+{'-'*10}|")

for i in range(3, 10):
    model_factory = lambda: KMeans(n_clusters=i, random_state=42, n_init=20)
    
    model = model_factory()
    labels = model.fit_predict(X_latent)

    metrics_res = evaluate_clustering_comprehensive(
        X_latent, labels, df.copy(),
        model_factory,
        binary_cols
    )

    raw_metrics = metrics_res['metrics'] 
    comp_score  = metrics_res['composite_score']

    record = {
        'k': i,
        'model': model,
        'labels': labels,
        'composite_score': comp_score,
        **raw_metrics
    }
    results.append(record)

    print(f"| {i:^3} | {comp_score:<8.6f} | {raw_metrics['silhouette']:<6.6f} | "
          f"{raw_metrics['davies_bouldin']:<6.6f} | {raw_metrics['calinski_harabasz']:<8.6f} | "
          f"{raw_metrics['purity']:<6.6f} | {raw_metrics['stability']:<6.6f} | {raw_metrics['interpretability']:<6.6f} |")

|  K  |  Score   |   Sil.   |    DB    |     CH     |  Purity  |  Stab.   |  Interp  |
|-----+----------+----------+----------+------------+----------+----------+----------|
|  3  | 0.725575 | 0.442093 | 1.004044 | 223.989851 | 0.849337 | 0.628044 | 0.936774 |
|  4  | 0.860043 | 0.530613 | 0.835884 | 336.071705 | 0.863760 | 0.982536 | 0.979710 |
|  5  | 0.877370 | 0.555704 | 0.694095 | 438.120406 | 0.866944 | 0.997151 | 0.984283 |
|  6  | 0.849927 | 0.485995 | 0.751707 | 440.021792 | 0.868007 | 0.930737 | 0.985898 |
|  7  | 0.818026 | 0.469977 | 0.834006 | 411.950115 | 0.858330 | 0.847469 | 0.986705 |
|  8  | 0.783303 | 0.468941 | 0.889430 | 396.703919 | 0.870448 | 0.734619 | 0.987520 |
|  9  | 0.789632 | 0.466899 | 0.874739 | 399.636980 | 0.874462 | 0.749608 | 0.992294 |


In [37]:
df_results = pd.DataFrame(results)

best_idx = df_results['composite_score'].idxmax()
best_config = df_results.loc[best_idx]

best_model = best_config['model']
best_labels = best_config['labels']
best_k = best_config['k']
X_for_clustering = X_latent

print("-" * 80)
print(f'SELECTED BEST K: {best_k}')
print(f'   Silhouette      : {best_config["silhouette"]:.6f}') 
print(f'   Composite Score : {best_config["composite_score"]:.6f}')

--------------------------------------------------------------------------------
SELECTED BEST K: 5
   Silhouette      : 0.555704
   Composite Score : 0.877370


# Generate Cluster Label

## Binning
Divides each continuous feature into 3 quantile bins (tertiles).

Labels: 0 (low), 0.5 (medium), 1 (high)

Enables interpretable cluster profiling and feature discretization.

In [38]:
for col in df.select_dtypes('float64').columns.tolist():
    new_col_name = col + '_bin'
    df[new_col_name] = pd.qcut(df[col], q=3, labels=[0, 0.5, 1]).astype(int)

non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

new_column_order = []

for col in non_numeric_cols:
    if col in df.columns:
        new_column_order.append(col)

for col in binary_cols:
    if col in df.columns:
        new_column_order.append(col)

for col in continuous_cols:
    if col in df.columns:
        new_column_order.append(col)
    bin_col_name = col + '_bin'
    if bin_col_name in df.columns:
        new_column_order.append(bin_col_name)

if 'cluster' in df.columns and 'cluster' not in new_column_order:
    new_column_order.append('cluster')

df = df[new_column_order]

In [39]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 36 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   brand                 428 non-null    str    
 1   name                  428 non-null    str    
 2   lightweight           428 non-null    int64  
 3   rocker                428 non-null    int64  
 4   removable_insole      428 non-null    int64  
 5   pace_daily_running    428 non-null    int64  
 6   pace_tempo            428 non-null    int64  
 7   pace_competition      428 non-null    int64  
 8   arch_neutral          428 non-null    int64  
 9   arch_stability        428 non-null    int64  
 10  strike_heel           428 non-null    int64  
 11  strike_mid            428 non-null    int64  
 12  strike_forefoot       428 non-null    int64  
 13  plate_rock_plate      428 non-null    int64  
 14  plate_carbon_plate    428 non-null    int64  
 15  season_summer         428 non-null

## Cluster Summary
Creates interpretable profile for each cluster showing:
- Size (count + percentage)
- Continuous features (mean values)
- Binary features (dominant variant + prevalence)

In [40]:
df['cluster'] = best_labels 

bin_groups = {}
for col in binary_cols:
    parts = col.split('_')
    
    if len(parts) > 1:
        prefix = '_'.join(parts[:-1])
    else:
        prefix = col
        
    bin_groups.setdefault(prefix, []).append(col)

rows = []
for cid in sorted(df['cluster'].unique()):
    subset = df[df['cluster'] == cid]
    n = len(subset)
    
    row = {'count': n, 'percentage': f"{n/len(df)*100:.1f}%"}

    for col in continuous_cols:
        row[col.lower()] = round(subset[col].mean(), 2)

    for prefix, cols in bin_groups.items():
        means = subset[cols].mean()
        best_col = means.idxmax()
        best_val = means.max()
        
        if len(cols) > 1:
            header = prefix.lower()
            val_str = best_col.replace(f"{prefix}_", "").lower()
            row[header] = f"{val_str} ({best_val*100:.0f}%)"
            
        else:
            header = cols[0].lower()
            val_str = "yes" if best_val > 0.5 else "no"
            row[header] = f"{val_str} ({best_val*100:.0f}%)"

    rows.append(row)

df_summary = pd.DataFrame(rows, index=sorted(df['cluster'].unique()))
df_summary.index.name = None 

print("Cluster Summary:")
display(df_summary)

Cluster Summary:


Unnamed: 0,count,percentage,weight_lab_oz,drop_lab_mm,midsole_softness,toebox_durability,heel_durability,outsole_durability,breathability_scaled,width_fit,...,lightweight,rocker,removable_insole,pace_daily_running,pace,arch,strike,plate_rock_plate,plate_carbon_plate,season
0,29,6.8%,9.7,10.46,0.03,0.0,0.0,0.0,0.0,2.66,...,no (17%),no (10%),no (28%),yes (100%),tempo (10%),neutral (76%),heel (83%),no (0%),no (0%),summer (0%)
1,74,17.3%,7.47,7.66,3.5,2.0,2.65,2.43,4.11,2.11,...,yes (92%),yes (70%),yes (80%),no (9%),competition (65%),neutral (99%),mid (86%),no (1%),yes (64%),all (95%)
2,165,38.6%,9.68,7.35,3.87,3.05,3.4,3.64,3.07,2.89,...,no (16%),no (30%),yes (99%),yes (96%),tempo (18%),neutral (85%),mid (100%),no (0%),no (5%),all (98%)
3,78,18.2%,9.69,8.2,2.69,0.44,0.29,0.15,2.01,1.74,...,no (23%),no (29%),yes (96%),yes (96%),tempo (18%),neutral (86%),mid (81%),no (0%),no (1%),all (60%)
4,82,19.2%,9.98,11.66,3.78,2.84,3.26,3.35,3.1,2.71,...,no (11%),no (17%),yes (100%),yes (94%),tempo (17%),neutral (77%),heel (100%),no (0%),no (4%),all (96%)


# Deep Learn Recommender

## Priority Handler

In [41]:
def get_priority_val(user_input, priority_list, mapping_dicts):
    """
    Extract feature value from user input with priority hierarchy.
    
    Strategy: Checks inputs in priority order, returns mapped value from
    first non-empty input, ignores lower-priority inputs if higher-priority exists.
    
    Args:
        user_input (dict): User preferences {'running_purpose': 'Daily', ...}
        priority_list (list): Input sources in priority order
        mapping_dicts (dict): Maps {source: {option: feature_value}}
    
    Returns:
        float: Feature value [0, 1] or 0.5 (neutral) if not found
    """
    for source_key in priority_list:
        if source_key in user_input and user_input[source_key]:
            user_choice = user_input[source_key]
            if source_key in mapping_dicts:
                mapping = mapping_dicts[source_key]
                if user_choice in mapping:
                    return mapping[user_choice]
    return 0.5

## Input Handler

In [42]:
def preprocess_user_input_with_mask(user_input, binary_cols, continuous_cols):
    """
    Transform user preferences into feature vector with intelligent masking.
    
    Feature Construction:
    1. SIMPLE FEATURES (single source dependency e.g., lightweight from pace)
    2. PRIORITY OVERWRITE (multi-source with hierarchy e.g., strike_pattern > pace)
    3. MASKING (only include features derived from provided inputs)
    4. FALLBACK (default to 0.5 for unknowns)
    
    Returns:
        tuple: (full_vector_raw, valid_indices)
            - full_vector_raw: Feature vector [0-1] for all features
            - valid_indices: Positions of user-provided features
                (used for masked similarity calculation)
    """
    feats = {col: 0.0 for col in binary_cols + continuous_cols}
    
    feats['lightweight'] = get_priority_val(user_input, ['pace'], 
        {'pace': {'Easy': 0.5, 'Steady': 0.5, 'Fast': 1.0}})
    feats['rocker'] = get_priority_val(user_input, ['running_purpose'], 
        {'running_purpose': {'Race': 1.0, 'Tempo': 0.5, 'Daily': 0.0}})
    feats['removable_insole'] = get_priority_val(user_input, ['orthotic_usage'], 
        {'orthotic_usage': {'Yes': 1.0, 'No': 0.5}})
    
    purp = user_input.get('running_purpose', 'Daily')
    feats['pace_daily_running'] = 1.0 if purp == 'Daily' else (0.5 if purp == 'Tempo' else 0.0)
    feats['pace_tempo'] = 1.0 if purp == 'Tempo' else 0.5
    feats['pace_competition'] = 1.0 if purp == 'Race' else (0.5 if purp == 'Tempo' else 0.0)

    feats['arch_neutral'] = get_priority_val(user_input, ['arch_type'], 
        {'arch_type': {'Flat': 0.0, 'Normal': 0.8, 'High': 1.0}})
    feats['arch_stability'] = get_priority_val(user_input, ['arch_type'], 
        {'arch_type': {'Flat': 1.0, 'Normal': 0.2, 'High': 0.0}})
    
    feats['drop_lab_mm'] = get_priority_val(user_input, ['pace'], 
        {'pace': {'Easy': 1.0, 'Steady': 0.5, 'Fast': 0.0}})

    prio_strike = ['strike_pattern', 'pace']
    feats['strike_heel'] = get_priority_val(user_input, prio_strike, {
        'strike_pattern': {'Heel': 1.0, 'Mid': 0.5, 'Forefoot': 0.0}, 
        'pace': {'Easy': 1.0, 'Steady': 0.5, 'Fast': 0.0}})
    feats['strike_mid'] = get_priority_val(user_input, prio_strike, {
        'strike_pattern': {'Heel': 0.5, 'Mid': 1.0, 'Forefoot': 0.5}, 
        'pace': {'Easy': 0.5, 'Steady': 1.0, 'Fast': 0.5}})
    feats['strike_forefoot'] = get_priority_val(user_input, prio_strike, {
        'strike_pattern': {'Heel': 0.0, 'Mid': 0.0, 'Forefoot': 1.0}, 
        'pace': {'Easy': 0.0, 'Steady': 0.5, 'Fast': 1.0}})

    prio_soft = ['cushion_preferences', 'pace']
    feats['midsole_softness'] = get_priority_val(user_input, prio_soft, {
        'cushion_preferences': {'Soft': 1.0, 'Balanced': 0.6, 'Firm': 0.2}, 
        'pace': {'Easy': 1.0, 'Steady': 0.6, 'Fast': 0.2}})

    prio_width = ['stability_need', 'foot_width']
    feats['width_fit'] = get_priority_val(user_input, prio_width, {
        'stability_need': {'Neutral': 0.5, 'Guided': 0.2}, 
        'foot_width': {'Narrow': 0.2, 'Regular': 0.6, 'Wide': 1}})
    
    feats['toebox_width'] = get_priority_val(user_input, ['stability_need'], 
        {'stability_need': {'Neutral': 0.5, 'Guided': 0.2}})
    
    prio_stiff = ['arch_type', 'pace', 'running_purpose']
    feats['stiffness_scaled'] = get_priority_val(user_input, prio_stiff, {
        'arch_type': {'Flat': 0.0, 'Normal': 0.5, 'High': 0.5}, 
        'pace': {'Easy': 0.2, 'Steady': 0.6, 'Fast': 1.0}, 
        'running_purpose': {'Daily': 0.2, 'Tempo': 0.6, 'Race': 1}})

    prio_tor = ['arch_type', 'pace']
    feats['torsional_rigidity'] = get_priority_val(user_input, prio_tor, {
        'arch_type': {'Flat': 1.0, 'Normal': 0.5, 'High': 0.5}, 
        'pace': {'Easy': 0.2, 'Steady': 0.6, 'Fast': 1.0}})

    feats['heel_stiff'] = get_priority_val(user_input, ['arch_type'], 
        {'arch_type': {'Flat': 1.0, 'Normal': 0.6, 'High': 0.2}})

    prio_plate = ['pace', 'running_purpose']
    feats['plate_rock'] = get_priority_val(user_input, prio_plate, {
        'pace': {'Easy': 0.5, 'Steady': 0.5, 'Fast': 0.5}, 
        'running_purpose': {'Daily': 0.5, 'Tempo': 0.5, 'Race': 0.5}})
    feats['plate_carbon'] = get_priority_val(user_input, prio_plate, {
        'pace': {'Easy': 0.5, 'Steady': 0.5, 'Fast': 1.0}, 
        'running_purpose': {'Daily': 0.5, 'Tempo': 0.5, 'Race': 1.0}})

    prio_stack = ['strike_pattern', 'pace', 'running_purpose']
    feats['heel_lab_mm'] = get_priority_val(user_input, prio_stack, {
        'strike_pattern': {'Heel': 1.0, 'Mid': 0.5, 'Forefoot': 0.0}, 
        'pace': {'Easy': 1.0, 'Steady': 0.5, 'Fast': 0.0}, 
        'running_purpose': {'Daily': 1.0, 'Tempo': 0.5, 'Race': 0.5}})
    feats['forefoot_lab_mm'] = get_priority_val(user_input, prio_stack, {
        'strike_pattern': {'Heel': 0.0, 'Mid': 0.5, 'Forefoot': 1.0}, 
        'pace': {'Easy': 0.0, 'Steady': 0.5, 'Fast': 1.0}, 
        'running_purpose': {'Daily': 1.0, 'Tempo': 0.5, 'Race': 0.5}})

    feats['weight_lab_oz'] = 1.0 - feats['lightweight']
    feats['toebox_durability'] = 1.0
    feats['heel_durability'] = 1.0
    feats['outsole_durability'] = 1.0
    feats['breathability'] = 1.0

    feats['season_summer'] = get_priority_val(user_input, ['season'], 
        {'season': {'Summer': 1.0, 'Spring & Fall': 0.5, 'Winter': 0.0}})
    feats['season_winter'] = get_priority_val(user_input, ['season'], 
        {'season': {'Summer': 0.0, 'Spring & Fall': 0.0, 'Winter': 1.0}})
    feats['season_all'] = get_priority_val(user_input, ['season'], 
        {'season': {'Summer': 0.5, 'Spring & Fall': 1.0, 'Winter': 0.0}})
    
    provided_inputs = {k for k, v in user_input.items() if v}  # Track which inputs user provided
    
    feature_sources = {
        'lightweight': ['pace'], 'rocker': ['running_purpose'], 'removable_insole': ['orthotic_usage'],
        'pace_daily_running': ['running_purpose'], 'pace_tempo': ['running_purpose'], 'pace_competition': ['running_purpose'],
        'arch_neutral': ['arch_type'], 'arch_stability': ['arch_type'],
        'drop_lab_mm': ['pace'],
        'strike_heel': ['strike_pattern', 'pace'], 'strike_mid': ['strike_pattern', 'pace'], 'strike_forefoot': ['strike_pattern', 'pace'],
        'midsole_softness': ['cushion_preferences', 'pace'],
        'width_fit': ['stability_need', 'foot_width'],
        'toebox_width': ['stability_need'],
        'stiffness_scaled': ['arch_type', 'pace', 'running_purpose'],
        'torsional_rigidity': ['arch_type', 'pace'],
        'heel_stiff': ['arch_type'],
        'plate_rock': ['pace', 'running_purpose'], 'plate_carbon': ['pace', 'running_purpose'],
        'heel_lab_mm': ['strike_pattern', 'pace', 'running_purpose'], 
        'forefoot_lab_mm': ['strike_pattern', 'pace', 'running_purpose'],
        'weight_lab_oz': ['pace'],
        'season_summer': ['season'], 'season_winter': ['season'], 'season_all': ['season'],
        'toebox_durability': [], 'heel_durability': [], 'outsole_durability': [], 'breathability': []
    }
    
    all_cols = binary_cols + continuous_cols
    full_vector_raw = []
    for col in binary_cols:
        full_vector_raw.append(feats.get(col, 0.0))
    for col in continuous_cols:
        full_vector_raw.append(feats.get(col, 0.5))

    valid_indices = []
    for i, col in enumerate(all_cols):
        sources = feature_sources.get(col, [])
        if any(src in provided_inputs for src in sources):
            valid_indices.append(i)
            
    if not valid_indices:
        valid_indices = list(range(len(all_cols)))
        
    return full_vector_raw, valid_indices

## Recommendation

In [43]:
def recommend_shoes_deep_masked(user_input, df_data, encoder_model, kmeans_model, binary_cols, continuous_cols, X_combined_data):
    """
    Deep Learning Recommendation Pipeline with Masked Similarity.
    
    Pipeline:
    1. USER PREPROCESSING: Convert user preferences to feature vector with masking
    2. CLUSTER ROUTING: Encode user→latent space, select top K/3 closest clusters
    3. CANDIDATE RANKING: Score shoes via masked cosine similarity
    4. RESULT: Return top 10 ranked recommendations
    
    Masking Benefits:
    - Reduces noise from unanswered questions
    - Focuses similarity on user-provided dimensions only
    - Example: if user only provided 'pace', similarity computed on pace-related features
    
    Args:
        user_input (dict): User questionnaire responses
        df_data (pd.DataFrame): Shoe catalog
        encoder_model: Trained keras encoder
        kmeans_model: Trained KMeans model (K clusters)
        binary_cols (list): Binary feature names
        continuous_cols (list): Continuous feature names
        X_combined_data (np.ndarray): Preprocessed feature matrix (n_shoes, n_features)
    
    Returns:
        pd.DataFrame: Top 10 shoes with index (row number) and match_score, sorted descending
    """
    full_vector, valid_idx = preprocess_user_input_with_mask(user_input, binary_cols, continuous_cols)
    full_vector = np.array([full_vector])

    user_latent = encoder_model.predict(full_vector, verbose=0)
    distances = kmeans_model.transform(user_latent)[0]
    n_select = math.ceil(kmeans_model.n_clusters / 3)  # Select top 1/3 clusters for diversity
    closest_clusters = np.argsort(distances)[:n_select]
    
    print(f"User mapped to Clusters: {closest_clusters}")
    
    candidates = df_data[df_data['cluster'].isin(closest_clusters)].copy()
    if candidates.empty: 
        return pd.DataFrame()
    
    candidate_vectors = X_combined_data[candidates.index]
    
    user_vec_masked = full_vector[:, valid_idx]  # Slice user vector to only relevant features
    cand_vecs_masked = candidate_vectors[:, valid_idx]  # Slice candidate vectors accordingly
    
    if np.all(user_vec_masked == 0):
        scores = np.zeros(len(candidates))
    else:
        scores = cosine_similarity(user_vec_masked, cand_vecs_masked)[0]  # Masked similarity calculation
    
    candidates['match_score'] = scores
    
    # Return: sorted by match_score descending, take top 10, keep only match_score (index included as row identifier)
    return candidates.sort_values('match_score', ascending=False).head(10)[['match_score']]

# Testing
Input options for recommendation engine test cases.

Allows generation of random user preference combinations.

## Define Options

In [44]:
input_options = {
    'running_purpose': ['Daily', 'Tempo', 'Race'],
    'pace': ['Easy', 'Steady', 'Fast'],
    'orthotic_usage': ['Yes', 'No'],
    'arch_type': ['Flat', 'Normal', 'High'],
    'strike_pattern': ['Heel', 'Mid', 'Forefoot'],
    'cushion_preferences': ['Soft', 'Balanced', 'Firm'],
    'foot_width': ['Narrow', 'Regular', 'Wide'],
    'stability_need': ['Neutral', 'Guided'],
    'season': ['Summer', 'Winter', 'Spring & Fall']
}

def generate_random_user_input(num_features):
    """
    Generate randomized user preference input for testing and validation.
    
    Purpose: Creates realistic test cases with variable input completeness.
    
    Args:
        num_features (int): Number of random features to include
    
    Returns:
        dict: User preferences with num_features random keys/values
              e.g., {'pace': 'Fast', 'arch_type': 'Normal', 'season': 'Summer'}
    """
    all_keys = list(input_options.keys())
    selected_keys = random.sample(all_keys, k=min(num_features, len(all_keys)))
    
    user_input = {}
    for key in selected_keys:
        user_input[key] = random.choice(input_options[key])
        
    return user_input

## Execution
Test Suite Execution
Runs recommendation engine on multiple test cases with varying input completeness.

Tests: 3 features (partial), 6 features (moderate), 9 features (complete)

In [47]:
target_counts = [3, 6, 9]

print("=== RECOMMENDATION ENGINE TEST SUITE ===")

for i, count in enumerate(target_counts):
    print(f"\n{'-'*60}")
    print(f"TEST CASE #{i+1}: User providing {count} preferences")
    
    random_input = generate_random_user_input(count)
    print(f"User Input:\n{random_input}")
    
    try:
        recommendations = recommend_shoes_deep_masked(
            random_input, 
            df, 
            encoder, 
            best_model, 
            binary_cols, 
            continuous_cols, 
            X_combined
        )
        
        if not recommendations.empty:
            print("\nTop 10 Recommendations:")
            # Get brand, name, cluster from original df using index, add match_score from recommendations
            result_df = pd.DataFrame({
                'brand': df.loc[recommendations.index, 'brand'].values,
                'name': df.loc[recommendations.index, 'name'].values,
                'match_score': recommendations['match_score'].values,
                'cluster': df.loc[recommendations.index, 'cluster'].values
            })
            display(result_df)
        else:
            print("\nNo recommendations found (cluster empty).")
            
    except NameError:
        print("\nERROR: Ensure model and preprocessing functions are loaded.")
    except Exception as e:
        print(f"\nERROR: {e}")

=== RECOMMENDATION ENGINE TEST SUITE ===

------------------------------------------------------------
TEST CASE #1: User providing 3 preferences
User Input:
{'cushion_preferences': 'Soft', 'strike_pattern': 'Forefoot', 'foot_width': 'Regular'}
User mapped to Clusters: [2 4]

Top 10 Recommendations:


Unnamed: 0,brand,name,match_score,cluster
0,altra,via olympus 2,0.914598,2
1,altra,torin 7,0.911951,2
2,altra,paradigm 7,0.908588,2
3,hoka,gaviota 5,0.907943,2
4,altra,rivera 3,0.907725,2
5,new balance,fresh foam x kaiha road,0.90347,2
6,new balance,fresh foam x 880 v15,0.899275,2
7,new balance,fresh foam x more v6,0.898708,2
8,altra,experience flow,0.897785,2
9,new balance,fresh foam x vongo v6,0.897727,2



------------------------------------------------------------
TEST CASE #2: User providing 6 preferences
User Input:
{'season': 'Spring & Fall', 'pace': 'Fast', 'arch_type': 'Normal', 'running_purpose': 'Race', 'strike_pattern': 'Heel', 'orthotic_usage': 'Yes'}
User mapped to Clusters: [1 4]

Top 10 Recommendations:


Unnamed: 0,brand,name,match_score,cluster
0,brooks,hyperion elite 4,0.888332,1
1,new balance,fuelcell supercomp elite v3,0.880129,1
2,on,cloudboom echo 3,0.880056,1
3,hoka,cielo x1 2.0,0.877701,1
4,brooks,hyperion elite 5,0.865486,1
5,adidas,adizero adios pro 3,0.861127,1
6,saucony,endorphin pro 2,0.860182,1
7,new balance,fuelcell supercomp elite v5,0.860054,1
8,saucony,endorphin pro 4,0.859389,1
9,asics,magic speed 4,0.858192,1



------------------------------------------------------------
TEST CASE #3: User providing 9 preferences
User Input:
{'running_purpose': 'Tempo', 'arch_type': 'High', 'foot_width': 'Regular', 'orthotic_usage': 'Yes', 'season': 'Winter', 'pace': 'Steady', 'strike_pattern': 'Heel', 'stability_need': 'Guided', 'cushion_preferences': 'Firm'}
User mapped to Clusters: [3 1]

Top 10 Recommendations:


Unnamed: 0,brand,name,match_score,cluster
0,under armour,slipspeed mega,0.800023,3
1,brooks,launch 9,0.78873,3
2,puma,deviate nitro elite 3,0.772177,1
3,asics,megablast,0.770719,1
4,adidas,adizero prime x 2 strung,0.766143,1
5,brooks,hyperion elite 4,0.763507,1
6,adidas,adizero prime x3 strung,0.762527,1
7,nike,zoom fly 6,0.761424,1
8,adidas,runfalcon,0.759134,3
9,asics,noosa tri 14,0.758233,3


# Save Artifacts
Saves 4 artifacts for complete model reconstruction:
1. shoe_encoder.keras: Trained autoencoder (feature encoding)
2. kmeans_model.pkl: Trained K-means clusters
3. shoe_metadata.pkl: Complete shoe dataset with cluster assignments
4. shoe_features.pkl: Preprocessed feature matrix (X_combined)

Artifacts stored in timestamped versioned directories for traceability.

In [46]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
save_dir = f"../../model_artifacts/road/v_{timestamp}"

os.makedirs(save_dir, exist_ok=True)
print(f"Saving models to: {save_dir}")

encoder.save(f'{save_dir}/shoe_encoder.keras')

with open(f'{save_dir}/kmeans_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

df.to_pickle(f'{save_dir}/shoe_metadata.pkl')

with open(f'{save_dir}/shoe_features.pkl', 'wb') as f:
    pickle.dump(X_combined, f)

print("Models saved successfully!")

Saving models to: ../../model_artifacts/road/v_20260214_114821
Models saved successfully!
