In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score
import os

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
dataset_path = 'data/synthetic_supplier_ranking_dataset.csv'
model_dir = 'models'
model_path = os.path.join(model_dir, 'supplier_ranker_model.txt')

os.makedirs(model_dir, exist_ok=True)

try:
    df = pd.read_csv(dataset_path)
    print("--- Training Data Loaded ---")
    print(f"Dataset shape: {df.shape}")
    display(df.head())
    
    # Data validation
    required_cols = ['price', 'lead_time', 'on_time_delivery_rate', 'quality_acceptance_rate',
                    'defect_rate', 'past_performance_score', 'fulfillment_rate', 
                    'responsiveness_score', 'capacity_reliability_score', 'has_iso_9001',
                    'financial_stability_score', 'geo_risk_score', 'relevance', 'query_id']
    
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"⚠️ WARNING: Missing columns: {missing_cols}")
    
    print(f"Number of unique queries: {df['query_id'].nunique()}")
    print(f"Average suppliers per query: {df.groupby('query_id').size().mean():.1f}")
    
except FileNotFoundError:
    print(f"ERROR: Dataset not found at '{dataset_path}'")
    print(f"Current working directory: {os.getcwd()}")

--- Training Data Loaded ---
Dataset shape: (979, 16)


Unnamed: 0,query_id,supplier_id,supplier_name,price,lead_time,on_time_delivery_rate,quality_acceptance_rate,defect_rate,past_performance_score,fulfillment_rate,responsiveness_score,capacity_reliability_score,has_iso_9001,financial_stability_score,geo_risk_score,relevance
0,101,101_1,CoreIndustries,133.87,15,0.963,0.953,0.001,94.9,1.0,4.5,4.8,1,4.4,1,4
1,101,101_2,FocusedTech,186.4,10,0.99,0.995,0.001,91.6,1.0,4.9,5.0,1,4.6,2,4
2,101,101_3,EconoSupply,111.41,19,0.944,0.94,0.034,88.6,0.93,4.4,4.4,0,3.8,3,2
3,101,101_4,StandardSupply Co,141.28,13,0.99,0.995,0.01,91.8,0.998,4.6,4.7,1,4.9,1,4
4,101,101_5,StandardSupply Co,138.23,18,0.952,0.995,0.004,92.2,0.999,4.4,4.8,1,4.7,2,4


Number of unique queries: 150
Average suppliers per query: 6.5


In [3]:
FEATURES = [
    'price', 'lead_time', 'on_time_delivery_rate', 'quality_acceptance_rate',
    'defect_rate', 'past_performance_score', 'fulfillment_rate', 
    'responsiveness_score', 'capacity_reliability_score', 'has_iso_9001',
    'financial_stability_score', 'geo_risk_score'
]
LABEL = 'relevance'
GROUP_ID = 'query_id'

# Prepare features and labels
X = df[FEATURES]
y = df[LABEL]

# CORRECTED: Pass actual group IDs, not group sizes
groups = df[GROUP_ID]

# Additional data validation
print("--- Data Validation ---")
print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print(f"Missing values in features: {X.isnull().sum().sum()}")
print(f"Missing values in labels: {y.isnull().sum()}")
print(f"Label range: [{y.min():.2f}, {y.max():.2f}]")

--- Data Validation ---
Features shape: (979, 12)
Labels shape: (979,)
Missing values in features: 0
Missing values in labels: 0
Label range: [0.00, 4.00]


In [8]:
# Configure the LambdaMART Model parameters
params = {
    'objective': 'lambdarank', 'metric': 'ndcg', 'boosting_type': 'gbdt',
    'n_estimators': 500, 'learning_rate': 0.1, 'num_leaves': 10,
    'min_child_samples': 2, 'max_depth': 5, 'seed': 42,
    'n_jobs': -1, 'verbose': -1,
}

# Use GroupKFold for cross-validation
gkf = GroupKFold(n_splits=5)

models = []
fold_ndcg_scores = []
# --- ADD THIS LINE ---
# Create a list to store the feature importance from each fold
feature_importance_list = []
# --- END OF ADDITION ---

print("\n--- Training with 5-Fold Cross-Validation ---")
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=df[GROUP_ID])):
    print(f"\n--- Fold {fold+1}/5 ---")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    train_groups = df.iloc[train_idx].groupby(GROUP_ID).size().to_numpy()
    val_groups = df.iloc[val_idx].groupby(GROUP_ID).size().to_numpy()

    # Initialize and train the LGBMRanker model
    ranker = lgb.LGBMRanker(**params)
    ranker.fit(
        X_train, y_train, group=train_groups,
        eval_set=[(X_val, y_val)], eval_group=[val_groups],
        eval_at=[1, 3],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True)]
    )
    models.append(ranker)
    
    # --- Evaluation Step ---
    # (This part is the same as before)
    y_pred_scores = ranker.predict(X_val)
    validation_df = df.iloc[val_idx].copy()
    validation_df['pred_score'] = y_pred_scores
    query_ndcg_scores = []
    for query_id in validation_df[GROUP_ID].unique():
        query_df = validation_df[validation_df[GROUP_ID] == query_id]
        true_relevance = [query_df[LABEL].tolist()]
        pred_scores = [query_df['pred_score'].tolist()]
        query_ndcg = ndcg_score(true_relevance, pred_scores, k=3)
        query_ndcg_scores.append(query_ndcg)
    avg_ndcg_for_fold = np.mean(query_ndcg_scores)
    fold_ndcg_scores.append(avg_ndcg_for_fold)
    print(f"Fold {fold+1} Average NDCG@3 Score: {avg_ndcg_for_fold:.4f}")

    # --- NEW: Feature Importance Step ---
    # Get the feature importances from the trained model for this fold
    fold_importance = pd.Series(ranker.feature_importances_, index=FEATURES)
    feature_importance_list.append(fold_importance)
    # --- END OF NEW STEP ---


--- Training with 5-Fold Cross-Validation ---

--- Fold 1/5 ---
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.993743
Fold 1 Average NDCG@3 Score: 0.9881

--- Fold 2/5 ---
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.984817
Fold 2 Average NDCG@3 Score: 0.9900

--- Fold 3/5 ---
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[13]	valid_0's ndcg@1: 0.982222	valid_0's ndcg@3: 0.980097
Fold 3 Average NDCG@3 Score: 0.9915

--- Fold 4/5 ---
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.988971
Fold 4 Average NDCG@3 Score: 0.9921

--- Fold 5/5 ---
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's ndcg@1: 1	val

In [9]:
# Cell 5: Enhanced Evaluation Results
mean_ndcg = np.mean(fold_ndcg_scores)
std_ndcg = np.std(fold_ndcg_scores)

print("\n--- Model Evaluation Summary ---")
print(f"Cross-validation NDCG@3 scores: {[f'{score:.4f}' for score in fold_ndcg_scores]}")
print(f"Average NDCG@3: {mean_ndcg:.4f} (+/- {std_ndcg:.4f})")

# Performance interpretation
if mean_ndcg > 0.95:
    print("\n✅ Excellent! The model has learned ranking patterns very well.")
elif mean_ndcg > 0.85:
    print("\n✅ Good! The model shows solid ranking performance.")
elif mean_ndcg > 0.70:
    print("\n⚠️ Fair. The model is learning but could be improved.")
else:
    print("\n❌ Poor. Consider feature engineering or parameter tuning.")

# Feature Importance Analysis
avg_importance = {}
for feature in FEATURES:
    importance_scores = [fold_imp[feature] for fold_imp in feature_importance_list]
    avg_importance[feature] = np.mean(importance_scores)

print("\n--- Top 5 Most Important Features ---")
sorted_features = sorted(avg_importance.items(), key=lambda x: x[1], reverse=True)
for i, (feature, importance) in enumerate(sorted_features[:5]):
    print(f"{i+1}. {feature}: {importance:.1f}")


--- Model Evaluation Summary ---
Cross-validation NDCG@3 scores: ['0.9881', '0.9900', '0.9915', '0.9921', '0.9889']
Average NDCG@3: 0.9901 (+/- 0.0015)

✅ Excellent! The model has learned ranking patterns very well.

--- Top 5 Most Important Features ---
1. quality_acceptance_rate: 8.2
2. lead_time: 7.0
3. defect_rate: 6.4
4. past_performance_score: 5.8
5. price: 3.4


In [10]:
# Cell 6: Save the Best Model (IMPROVED)
# Select the best performing model instead of just the first one
best_fold_idx = np.argmax(fold_ndcg_scores)
best_model = models[best_fold_idx]

best_model.booster_.save_model(model_path)
print(f"✅ Best model (Fold {best_fold_idx+1}) saved to: '{model_path}'")
print(f"Best model NDCG@3 score: {fold_ndcg_scores[best_fold_idx]:.4f}")


✅ Best model (Fold 4) saved to: 'models\supplier_ranker_model.txt'
Best model NDCG@3 score: 0.9921


In [14]:
# --- Step 1: Simulate a New Purchase Request ---
new_request_data = {
    'supplier_name': ['Mahindra Group', 'CI Car International', 'Michelin', 'Bosch Group'],
    'price': [148, 155, 160, 162],
    'lead_time': [22, 15, 8, 12],
    'on_time_delivery_rate': [0.93, 0.98, 0.97, 0.99],
    'quality_acceptance_rate': [0.96, 0.99, 0.985, 0.995],
    'defect_rate': [0.04, 0.01, 0.015, 0.005],
    'past_performance_score': [88, 95, 96, 98],
    'fulfillment_rate': [0.97, 0.99, 0.99, 1.0],
    'responsiveness_score': [4.4, 4.8, 4.7, 4.9],
    'capacity_reliability_score': [4.5, 4.7, 4.8, 4.9],
    # --- THIS IS THE FIX ---
    'has_iso_9001': [1, 1, 1, 1], # Corrected from two underscores to one
    # --- END OF FIX ---
    'financial_stability_score': [4.6, 4.9, 4.9, 4.9],
    'geo_risk_score': [1, 1, 2, 2]
}
candidates_df = pd.DataFrame(new_request_data)
print("--- New Candidates to Rank ---")
display(candidates_df)


# --- Step 2: Load the Saved Model ---
print(f"\n--- Loading model from: {model_path} ---")
try:
    ranker_model = lgb.Booster(model_file=model_path)
    print("✅ Model loaded successfully.")
except lgb.basic.LightGBMError:
    print(f"❌ ERROR: Model file not found at '{model_path}'. Please run the training and saving cells first.")
    raise

    
# --- Step 3: Prepare the New Data for Prediction ---
# The features must be in the exact same order as they were during training.
X_new = candidates_df[FEATURES]


# --- Step 4: Make Predictions ---
predicted_scores = ranker_model.predict(X_new)
candidates_df['predicted_score'] = predicted_scores


# --- Step 5: Show the Final, Ranked List ---
final_ranking = candidates_df.sort_values(by='predicted_score', ascending=False)

print("\n--- Final Ranked List (Top Recommendation First) ---")
display(final_ranking)

--- New Candidates to Rank ---


Unnamed: 0,supplier_name,price,lead_time,on_time_delivery_rate,quality_acceptance_rate,defect_rate,past_performance_score,fulfillment_rate,responsiveness_score,capacity_reliability_score,has_iso_9001,financial_stability_score,geo_risk_score
0,Mahindra Group,148,22,0.93,0.96,0.04,88,0.97,4.4,4.5,1,4.6,1
1,CI Car International,155,15,0.98,0.99,0.01,95,0.99,4.8,4.7,1,4.9,1
2,Michelin,160,8,0.97,0.985,0.015,96,0.99,4.7,4.8,1,4.9,2
3,Bosch Group,162,12,0.99,0.995,0.005,98,1.0,4.9,4.9,1,4.9,2



--- Loading model from: models\supplier_ranker_model.txt ---
✅ Model loaded successfully.

--- Final Ranked List (Top Recommendation First) ---


Unnamed: 0,supplier_name,price,lead_time,on_time_delivery_rate,quality_acceptance_rate,defect_rate,past_performance_score,fulfillment_rate,responsiveness_score,capacity_reliability_score,has_iso_9001,financial_stability_score,geo_risk_score,predicted_score
1,CI Car International,155,15,0.98,0.99,0.01,95,0.99,4.8,4.7,1,4.9,1,0.358418
3,Bosch Group,162,12,0.99,0.995,0.005,98,1.0,4.9,4.9,1,4.9,2,0.358418
2,Michelin,160,8,0.97,0.985,0.015,96,0.99,4.7,4.8,1,4.9,2,0.227027
0,Mahindra Group,148,22,0.93,0.96,0.04,88,0.97,4.4,4.5,1,4.6,1,-0.244231
