In [1]:
import os
import numpy as np
import pandas as pd

# ============================
# CONFIG
# ============================
np.random.seed(42)
N_SAMPLES = 5000

OUTPUT_DIR = "/Users/prajitbaskaran/Downloads/synthetic_data"
OUTPUT_FILE = "swiggy_delivery_synthetic.csv"

# Pricing coefficients (hidden from model)
RATE_PER_KM = 8.0
RATE_PER_MIN = 1.0
ALPHA = 10.0
MAX_SURGE = 50.0

# Noise level
NOISE_STD = 3.0  # rupees

# ============================
# CREATE OUTPUT DIRECTORY
# ============================
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================
# SYNTHETIC DATA GENERATION
# ============================

distance_km = np.random.uniform(0.5, 10.0, N_SAMPLES)
time_min = np.random.uniform(5, 40, N_SAMPLES)

active_orders = np.random.randint(50, 400, N_SAMPLES)
available_partners = np.random.randint(40, 200, N_SAMPLES)

base_fee = np.random.choice([15, 20, 25, 30], N_SAMPLES)
promo_discount = np.random.choice(
    [0, 10, 20, 30],
    N_SAMPLES,
    p=[0.5, 0.2, 0.2, 0.1]
)

# Demand index
demand_index = active_orders / available_partners

# Surge fee (capped)
surge_fee = np.minimum(ALPHA * demand_index, MAX_SURGE)

# Noise
noise = np.random.normal(0, NOISE_STD, N_SAMPLES)

# Final delivery fee
delivery_fee = (
    base_fee
    + (distance_km * RATE_PER_KM)
    + (time_min * RATE_PER_MIN)
    + surge_fee
    - promo_discount
    + noise
)

delivery_fee = np.clip(delivery_fee, 20, None)

# ============================
# CREATE DATAFRAME
# ============================

df = pd.DataFrame({
    "distance_km": distance_km,
    "time_min": time_min,
    "active_orders": active_orders,
    "available_partners": available_partners,
    "base_fee": base_fee,
    "promo_discount": promo_discount,
    "delivery_fee": delivery_fee
})

# ============================
# SAVE DATASET
# ============================

output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
df.to_csv(output_path, index=False)

print("‚úÖ Dataset saved successfully!")
print("üìÅ Location:", output_path)
print("üìä Shape:", df.shape)


‚úÖ Dataset saved successfully!
üìÅ Location: /Users/prajitbaskaran/Downloads/synthetic_data/swiggy_delivery_synthetic.csv
üìä Shape: (5000, 7)


In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# ============================
# LOAD DATASET
# ============================
DATA_PATH = "/Users/prajitbaskaran/Downloads/synthetic_data/swiggy_delivery_synthetic.csv"
df = pd.read_csv(DATA_PATH)

# ============================
# FEATURE ENGINEERING
# ============================
df["demand_index"] = df["active_orders"] / df["available_partners"]

X = df[
    ["distance_km", "time_min", "demand_index", "promo_discount", "base_fee"]
]
y = df["delivery_fee"]

# ============================
# TRAIN / TEST SPLIT
# ============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ============================
# TRAIN INTERPRETABLE MODEL
# ============================
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

print("‚úÖ Model trained")
print("üìà Test R¬≤:", round(r2_score(y_test, model.predict(X_test)), 3))

# ============================
# EXTRACT ADAPTABLE FORMULA
# ============================
print("\nüîç Learned Adaptable Formula:\n")

print(f"Base Intercept: ‚Çπ{model.intercept_:.2f}")

for name, coef in zip(X.columns, model.coef_):
    sign = "+" if coef >= 0 else "-"
    print(f"{sign} ‚Çπ{abs(coef):.2f} √ó {name}")

# ============================
# TRANSPARENT EXPLANATION FUNCTION
# ============================
def explain_order(row, model):
    contributions = {
        "Base intercept": model.intercept_,
        "Distance cost": model.coef_[0] * row["distance_km"],
        "Time cost": model.coef_[1] * row["time_min"],
        "Demand surge": model.coef_[2] * row["demand_index"],
        "Promotion effect": model.coef_[3] * row["promo_discount"],
        "Base fee policy": model.coef_[4] * row["base_fee"],
    }

    total = sum(contributions.values())

    explanation = pd.DataFrame.from_dict(
        contributions, orient="index", columns=["‚Çπ Contribution"]
    )
    explanation["‚Çπ Contribution"] = explanation["‚Çπ Contribution"].round(2)

    return explanation, round(total, 2)

# ============================
# EXPLAIN ONE SAMPLE ORDER
# ============================
sample = X_test.iloc[0]
explanation, predicted_fee = explain_order(sample, model)

print("\nüßæ Transparent Explanation for One Order:\n")
print(explanation)
print("\n‚û°Ô∏è Predicted Delivery Fee: ‚Çπ", predicted_fee)


‚úÖ Model trained
üìà Test R¬≤: 0.979

üîç Learned Adaptable Formula:

Base Intercept: ‚Çπ4.14
+ ‚Çπ7.98 √ó distance_km
+ ‚Çπ0.99 √ó time_min
+ ‚Çπ7.84 √ó demand_index
- ‚Çπ1.01 √ó promo_discount
+ ‚Çπ1.01 √ó base_fee

üßæ Transparent Explanation for One Order:

                  ‚Çπ Contribution
Base intercept              4.14
Distance cost              40.31
Time cost                   7.70
Demand surge               19.84
Promotion effect          -30.22
Base fee policy            15.15

‚û°Ô∏è Predicted Delivery Fee: ‚Çπ 56.91


In [4]:
# ============================================================
# ADAPTABLE TRANSPARENT PRICING MODEL
# + SHAP-GUIDED RL POLICY
# + PKL SAVING
# (ONE CELL ‚Äì HACKATHON READY)
# ============================================================

import pandas as pd
import numpy as np
import pickle

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import shap

# ============================================================
# 1Ô∏è‚É£ LOAD DATA
# ============================================================

DATA_PATH = "/Users/prajitbaskaran/Downloads/synthetic_data/swiggy_delivery_synthetic.csv" # change if needed
df = pd.read_csv(DATA_PATH)

# ============================================================
# 2Ô∏è‚É£ FEATURE ENGINEERING
# ============================================================

df["demand_index"] = df["active_orders"] / df["available_partners"]

FEATURES = [
    "distance_km",
    "time_min",
    "demand_index",
    "promo_discount",
    "base_fee"
]

X = df[FEATURES]
y = df["delivery_fee"]

# ============================================================
# 3Ô∏è‚É£ TRAIN / TEST SPLIT
# ============================================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ============================================================
# 4Ô∏è‚É£ TRAIN TRANSPARENT MODEL
# ============================================================

model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("‚úÖ Model trained")
print("üìà Test R¬≤:", round(r2_score(y_test, y_pred), 3))

# ============================================================
# 5Ô∏è‚É£ SHAP ANALYSIS (OFFLINE DIAGNOSTICS)
# ============================================================

background = X_train.sample(200, random_state=42)

explainer = shap.Explainer(model, background)
shap_values = explainer(X_test)

mean_abs_shap = np.abs(shap_values.values).mean(axis=0)

shap_df = pd.DataFrame({
    "feature": FEATURES,
    "mean_abs_shap": mean_abs_shap
}).sort_values("mean_abs_shap", ascending=False)

print("\nüîç SHAP Feature Importance (Mean |SHAP|):")
print(shap_df)

# ============================================================
# 6Ô∏è‚É£ SHAP-GUIDED RL POLICY (RETRAIN DECISION)
# ============================================================

# ---- Signals for RL ----
mse_error = np.mean((y_test - y_pred) ** 2)
shap_drift = shap_df["mean_abs_shap"].std()

# ---- Discrete State ----
state = (
    int(mse_error > 25),     # high error?
    int(shap_drift > 5)      # unstable importance?
)

# ---- Q-table (small & interpretable) ----
Q = {
    (0, 0): [0.0, 0.0],
    (0, 1): [0.0, 0.0],
    (1, 0): [0.0, 0.0],
    (1, 1): [0.0, 0.0]
}

# ---- Reward Function ----
def reward(state, action):
    error_flag, shap_flag = state

    if action == 1 and (error_flag or shap_flag):
        return +5   # retrain when needed
    if action == 1 and not (error_flag or shap_flag):
        return -3   # unnecessary retrain
    if action == 0 and (error_flag or shap_flag):
        return -5   # ignored retrain
    return +2       # stable decision

# ---- One-step Q-learning (demo-level) ----
alpha = 0.5

for action in [0, 1]:
    r = reward(state, action)
    Q[state][action] += alpha * (r - Q[state][action])

chosen_action = int(np.argmax(Q[state]))

print("\nü§ñ SHAP-Guided RL Decision")
print("State (HighError, HighSHAPDrift):", state)
print("Action:", "RETRAIN NEXT CYCLE" if chosen_action == 1 else "KEEP CURRENT MODEL")

# ============================================================
# 7Ô∏è‚É£ PACKAGE MODEL FOR DEPLOYMENT
# ============================================================

model_package = {
    "model": model,
    "features": FEATURES,
    "intercept": model.intercept_,
    "coefficients": dict(zip(FEATURES, model.coef_)),
    "shap_importance": shap_df,
    "rl_state": state,
    "rl_decision": "RETRAIN" if chosen_action == 1 else "NO_RETRAIN",
    "version": "v1.0",
    "description": "Adaptable Transparent Pricing Model with SHAP-guided RL"
}

# ============================================================
# 8Ô∏è‚É£ SAVE AS PKL
# ============================================================

PKL_PATH = "transparent_pricing_model.pkl"

with open(PKL_PATH, "wb") as f:
    pickle.dump(model_package, f)

print("\nüíæ Model saved successfully!")
print("üì¶ File:", PKL_PATH)


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


‚úÖ Model trained
üìà Test R¬≤: 0.979

üîç SHAP Feature Importance (Mean |SHAP|):
          feature  mean_abs_shap
0     distance_km      18.873587
2    demand_index       9.247385
3  promo_discount       9.142455
1        time_min       8.442279
4        base_fee       5.135459

ü§ñ SHAP-Guided RL Decision
State (HighError, HighSHAPDrift): (0, 1)
Action: RETRAIN NEXT CYCLE

üíæ Model saved successfully!
üì¶ File: transparent_pricing_model.pkl


In [5]:
# ============================================================
# END-TO-END DEMO:
# TRANSPARENT PRICING + POLICY ADAPTATION
# (ONE CELL ‚Äì HACKATHON READY)
# ============================================================

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# ============================================================
# 1Ô∏è‚É£ LOAD DATA
# ============================================================

df = pd.read_csv("/Users/prajitbaskaran/Downloads/synthetic_data/swiggy_delivery_synthetic.csv")

# ============================================================
# 2Ô∏è‚É£ SIMULATE TIME (WEEKS)
# ============================================================

ORDERS_PER_WEEK = 500
df = df.reset_index(drop=True)
df["week"] = df.index // ORDERS_PER_WEEK

# ============================================================
# 3Ô∏è‚É£ SIMULATE REAL-WORLD SHIFTS (POLICY PRESSURE)
# ============================================================

# Week ‚â•2 ‚Üí traffic worsens
df.loc[df["week"] >= 2, "time_min"] *= 1.3

# Week ‚â•3 ‚Üí demand surge
df.loc[df["week"] >= 3, "active_orders"] *= 1.5

# Recompute demand index
df["demand_index"] = df["active_orders"] / df["available_partners"]

FEATURES = [
    "distance_km",
    "time_min",
    "demand_index",
    "promo_discount",
    "base_fee"
]

# ============================================================
# 4Ô∏è‚É£ FIX ONE ORDER TO TRACK ACROSS TIME
# ============================================================

fixed_order = {
    "distance_km": 5.0,
    "time_min": 10.0,
    "active_orders": 200,
    "available_partners": 100,
    "promo_discount": 20,
    "base_fee": 20
}

print("\nüì¶ Tracking SAME ORDER across weeks")
print(fixed_order)

# ============================================================
# 5Ô∏è‚É£ WEEK-BY-WEEK TRAINING (POLICY ADAPTATION)
# ============================================================

policy_history = []

for week in sorted(df["week"].unique()):
    train_df = df[df["week"] <= week]

    X = train_df[FEATURES]
    y = train_df["delivery_fee"]

    model = Ridge(alpha=1.0)
    model.fit(X, y)

    # Store policy (coefficients)
    policy_history.append({
        "week": week,
        "distance_beta": model.coef_[0],
        "time_beta": model.coef_[1],
        "demand_beta": model.coef_[2],
        "promo_beta": model.coef_[3],
        "base_fee_beta": model.coef_[4],
        "intercept": model.intercept_
    })

    # ---- Explain SAME order under this week's policy ----
    demand_index = fixed_order["active_orders"] / fixed_order["available_partners"]

    fee = (
        model.intercept_
        + model.coef_[0] * fixed_order["distance_km"]
        + model.coef_[1] * fixed_order["time_min"]
        + model.coef_[2] * demand_index
        - model.coef_[3] * fixed_order["promo_discount"]
        + model.coef_[4] * fixed_order["base_fee"]
    )

    print(f"\nüóìÔ∏è Week {week} Policy")
    print(f"Distance Œ≤: {model.coef_[0]:.2f}")
    print(f"Time Œ≤:     {model.coef_[1]:.2f}")
    print(f"Demand Œ≤:   {model.coef_[2]:.2f}")
    print(f"‚û°Ô∏è Fee for SAME order: ‚Çπ{fee:.2f}")

# ============================================================
# 6Ô∏è‚É£ SHOW POLICY CHANGE CLEARLY
# ============================================================

policy_df = pd.DataFrame(policy_history)

print("\nüìä POLICY EVOLUTION OVER TIME")
print(policy_df[[
    "week",
    "distance_beta",
    "time_beta",
    "demand_beta"
]])

# ============================================================
# 7Ô∏è‚É£ FINAL MESSAGE (WHAT YOU SAY TO JUDGES)
# ============================================================

print("""
‚úÖ DEMO SUMMARY:
- Pricing formula NEVER changed
- Only coefficients adapted over time
- Same order ‚Üí different fee ‚Üí fully explainable
- Policy adapts to traffic & demand shifts
""")



üì¶ Tracking SAME ORDER across weeks
{'distance_km': 5.0, 'time_min': 10.0, 'active_orders': 200, 'available_partners': 100, 'promo_discount': 20, 'base_fee': 20}

üóìÔ∏è Week 0 Policy
Distance Œ≤: 7.89
Time Œ≤:     1.00
Demand Œ≤:   8.05
‚û°Ô∏è Fee for SAME order: ‚Çπ109.35

üóìÔ∏è Week 1 Policy
Distance Œ≤: 7.99
Time Œ≤:     1.00
Demand Œ≤:   8.03
‚û°Ô∏è Fee for SAME order: ‚Çπ109.53

üóìÔ∏è Week 2 Policy
Distance Œ≤: 7.93
Time Œ≤:     0.82
Demand Œ≤:   7.84
‚û°Ô∏è Fee for SAME order: ‚Çπ110.07

üóìÔ∏è Week 3 Policy
Distance Œ≤: 7.92
Time Œ≤:     0.74
Demand Œ≤:   6.18
‚û°Ô∏è Fee for SAME order: ‚Çπ109.09

üóìÔ∏è Week 4 Policy
Distance Œ≤: 7.92
Time Œ≤:     0.72
Demand Œ≤:   5.56
‚û°Ô∏è Fee for SAME order: ‚Çπ108.22

üóìÔ∏è Week 5 Policy
Distance Œ≤: 7.94
Time Œ≤:     0.72
Demand Œ≤:   5.38
‚û°Ô∏è Fee for SAME order: ‚Çπ107.24

üóìÔ∏è Week 6 Policy
Distance Œ≤: 7.92
Time Œ≤:     0.72
Demand Œ≤:   5.29
‚û°Ô∏è Fee for SAME order: ‚Çπ106.73

üóìÔ∏è Week 7 Policy
Distance Œ≤: 7

Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[310.5 573.  384.  ... 546.  475.5 453. ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


In [6]:
# ============================================================
# END-TO-END OFFLINE PIPELINE
# Transparent Pricing + Policy Adaptation
# Final output: PKL file
# ============================================================

import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import Ridge

# ============================================================
# 1Ô∏è‚É£ LOAD DATA
# ============================================================

DATA_PATH = "/Users/prajitbaskaran/Downloads/synthetic_data/swiggy_delivery_synthetic.csv"  # your dataset
df = pd.read_csv(DATA_PATH).reset_index(drop=True)

# ============================================================
# 2Ô∏è‚É£ SIMULATE TIME (WEEKS)
# ============================================================

ORDERS_PER_WEEK = 500
df["week"] = df.index // ORDERS_PER_WEEK

# ============================================================
# 3Ô∏è‚É£ SIMULATE REAL-WORLD SHIFTS (CONCEPT DRIFT)
# ============================================================

# Traffic increase after week 2
df.loc[df["week"] >= 2, "time_min"] *= 1.3

# Demand surge after week 3
df.loc[df["week"] >= 3, "active_orders"] *= 1.5

# ============================================================
# 4Ô∏è‚É£ FEATURE ENGINEERING
# ============================================================

df["demand_index"] = df["active_orders"] / df["available_partners"]

FEATURES = [
    "distance_km",
    "time_min",
    "demand_index",
    "promo_discount",
    "base_fee"
]

TARGET = "delivery_fee"

# ============================================================
# 5Ô∏è‚É£ POLICY ADAPTATION LOOP (SIMULATED RL CONTROL)
# ============================================================

policy_versions = []
current_model = None
policy_version_id = 0

for week in sorted(df["week"].unique()):
    train_df = df[df["week"] <= week]

    X = train_df[FEATURES]
    y = train_df[TARGET]

    # -------- Simulated RL decision --------
    # (For demo: retrain when enough new data arrives)
    retrain_required = (
        current_model is None or
        week in [0, 2, 3]  # pretend RL triggered retrain here
    )

    if retrain_required:
        model = Ridge(alpha=1.0)
        model.fit(X, y)
        current_model = model
        policy_version_id += 1

        print(f"üîÅ Retraining triggered at week {week} ‚Üí Policy v{policy_version_id}")

    policy_versions.append({
        "week": week,
        "policy_version": policy_version_id,
        "distance_beta": current_model.coef_[0],
        "time_beta": current_model.coef_[1],
        "demand_beta": current_model.coef_[2],
        "promo_beta": current_model.coef_[3],
        "base_fee_beta": current_model.coef_[4],
        "intercept": current_model.intercept_
    })

# ============================================================
# 6Ô∏è‚É£ FINAL POLICY (DEPLOYMENT VERSION)
# ============================================================

final_policy = policy_versions[-1]

print("\n‚úÖ FINAL DEPLOYED POLICY")
print(final_policy)

# ============================================================
# 7Ô∏è‚É£ PACKAGE MODEL FOR BACKEND USE
# ============================================================

model_package = {
    "model": current_model,
    "features": FEATURES,
    "intercept": current_model.intercept_,
    "coefficients": dict(zip(FEATURES, current_model.coef_)),
    "policy_version": final_policy["policy_version"],
    "training_weeks_used": final_policy["week"],
    "description": "Adaptable Transparent Pricing Model (Policy-Based)"
}

# ============================================================
# 8Ô∏è‚É£ SAVE AS PKL
# ============================================================

PKL_PATH = "transparent_pricing_model.pkl"

with open(PKL_PATH, "wb") as f:
    pickle.dump(model_package, f)

print("\nüíæ Model successfully saved!")
print("üì¶ File:", PKL_PATH)


üîÅ Retraining triggered at week 0 ‚Üí Policy v1
üîÅ Retraining triggered at week 2 ‚Üí Policy v2
üîÅ Retraining triggered at week 3 ‚Üí Policy v3

‚úÖ FINAL DEPLOYED POLICY
{'week': 9, 'policy_version': 3, 'distance_beta': 7.918838267228285, 'time_beta': 0.7384516756522838, 'demand_beta': 6.176550819900027, 'promo_beta': -0.9945101282833665, 'base_fee_beta': 0.9743284407312295, 'intercept': 10.378113977594438}

üíæ Model successfully saved!
üì¶ File: transparent_pricing_model.pkl


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[310.5 573.  384.  ... 546.  475.5 453. ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


In [7]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# ============================
# PATH CONFIG (HARDCODED)
# ============================
DATA_PATH = "/Users/prajitbaskaran/Downloads/synthetic_data/swiggy_delivery_synthetic.csv"
OUTPUT_DIR = "/Users/prajitbaskaran/Downloads/synthetic_data/output"
PKL_PATH = os.path.join(OUTPUT_DIR, "transparent_pricing_model.pkl")

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================
# LOAD DATASET
# ============================
df = pd.read_csv(DATA_PATH)

# ============================
# FEATURE ENGINEERING
# ============================
df["demand_index"] = df["active_orders"] / df["available_partners"]

FEATURES = [
    "distance_km",
    "time_min",
    "demand_index",
    "promo_discount",
    "base_fee",
]

X = df[FEATURES]
y = df["delivery_fee"]

# ============================
# TRAIN / TEST SPLIT
# ============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ============================
# TRAIN INTERPRETABLE MODEL
# ============================
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

print("‚úÖ Model trained")
print("üìà Test R¬≤:", round(r2_score(y_test, model.predict(X_test)), 3))

# ============================
# EXTRACT ADAPTABLE FORMULA
# ============================
print("\nüîç Learned Adaptable Formula:\n")
print(f"Base Intercept: ‚Çπ{model.intercept_:.2f}")

for name, coef in zip(FEATURES, model.coef_):
    sign = "+" if coef >= 0 else "-"
    print(f"{sign} ‚Çπ{abs(coef):.2f} √ó {name}")

# ============================
# TRANSPARENT EXPLANATION FUNCTION
# ============================
def explain_order(row, model):
    contributions = {
        "Base intercept": model.intercept_,
        "Distance cost": model.coef_[0] * row["distance_km"],
        "Time cost": model.coef_[1] * row["time_min"],
        "Demand surge": model.coef_[2] * row["demand_index"],
        "Promotion effect": model.coef_[3] * row["promo_discount"],
        "Base fee policy": model.coef_[4] * row["base_fee"],
    }

    total = sum(contributions.values())

    explanation = pd.DataFrame.from_dict(
        contributions, orient="index", columns=["‚Çπ Contribution"]
    )
    explanation["‚Çπ Contribution"] = explanation["‚Çπ Contribution"].round(2)

    return explanation, round(total, 2)

# ============================
# EXPLAIN ONE SAMPLE ORDER
# ============================
sample = X_test.iloc[0]
explanation, predicted_fee = explain_order(sample, model)

print("\nüßæ Transparent Explanation for One Order:\n")
print(explanation)
print("\n‚û°Ô∏è Predicted Delivery Fee: ‚Çπ", predicted_fee)

# ============================
# SAVE MODEL AS PKL (BACKEND READY)
# ============================
model_package = {
    "model": model,
    "features": FEATURES,
    "intercept": model.intercept_,
    "coefficients": dict(zip(FEATURES, model.coef_)),
    "description": "Data-driven transparent delivery pricing model",
}

with open(PKL_PATH, "wb") as f:
    pickle.dump(model_package, f)

print("\nüíæ Model saved successfully!")
print("üì¶ File:", PKL_PATH)


‚úÖ Model trained
üìà Test R¬≤: 0.979

üîç Learned Adaptable Formula:

Base Intercept: ‚Çπ4.14
+ ‚Çπ7.98 √ó distance_km
+ ‚Çπ0.99 √ó time_min
+ ‚Çπ7.84 √ó demand_index
- ‚Çπ1.01 √ó promo_discount
+ ‚Çπ1.01 √ó base_fee

üßæ Transparent Explanation for One Order:

                  ‚Çπ Contribution
Base intercept              4.14
Distance cost              40.31
Time cost                   7.70
Demand surge               19.84
Promotion effect          -30.22
Base fee policy            15.15

‚û°Ô∏è Predicted Delivery Fee: ‚Çπ 56.91

üíæ Model saved successfully!
üì¶ File: /Users/prajitbaskaran/Downloads/synthetic_data/output/transparent_pricing_model.pkl


In [8]:
import pandas as pd
import numpy as np
import os

# ============================
# PATHS
# ============================
INPUT_PATH = "/Users/prajitbaskaran/Downloads/synthetic_data/swiggy_delivery_synthetic.csv"
OUTPUT_DIR = "/Users/prajitbaskaran/Downloads/synthetic_data/output"
OUTPUT_PATH = os.path.join(OUTPUT_DIR, "swiggy_delivery_with_extra_features.csv")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================
# LOAD ORIGINAL DATASET
# ============================
df = pd.read_csv(INPUT_PATH)

np.random.seed(42)
n = len(df)

# ============================
# ADD LOW-IMPORTANCE FEATURES
# ============================

# Pure noise (no signal)
df["noise_feature_1"] = np.random.normal(0, 1, n)
df["noise_feature_2"] = np.random.uniform(0, 1, n)

# Random categorical-like numeric codes
df["area_code_id"] = np.random.randint(100, 999, n)
df["app_version"] = np.random.randint(1, 15, n)

# Weak / irrelevant binary flags
df["is_weekend"] = np.random.choice([0, 1], n)
df["is_raining"] = np.random.choice([0, 1], n)

# Random ratings (intentionally noisy)
df["restaurant_rating"] = np.random.uniform(2.5, 5.0, n)
df["delivery_partner_rating"] = np.random.uniform(3.0, 5.0, n)

# Arbitrary operational metadata
df["screen_time_sec"] = np.random.uniform(5, 120, n)
df["ui_clicks"] = np.random.randint(1, 20, n)

# ============================
# SAVE AUGMENTED DATASET
# ============================
df.to_csv(OUTPUT_PATH, index=False)

print("‚úÖ Dataset augmented with low-importance features")
print("üì¶ Saved at:", OUTPUT_PATH)
print("\nüßæ New columns added:")
print([
    "noise_feature_1",
    "noise_feature_2",
    "area_code_id",
    "app_version",
    "is_weekend",
    "is_raining",
    "restaurant_rating",
    "delivery_partner_rating",
    "screen_time_sec",
    "ui_clicks",
])


‚úÖ Dataset augmented with low-importance features
üì¶ Saved at: /Users/prajitbaskaran/Downloads/synthetic_data/output/swiggy_delivery_with_extra_features.csv

üßæ New columns added:
['noise_feature_1', 'noise_feature_2', 'area_code_id', 'app_version', 'is_weekend', 'is_raining', 'restaurant_rating', 'delivery_partner_rating', 'screen_time_sec', 'ui_clicks']


In [None]:
/Users/prajitbaskaran/Downloads/synthetic_data/output

In [10]:
import pandas as pd
import os

# ============================
# PATHS
# ============================
INPUT_PATH = "/Users/prajitbaskaran/Downloads/synthetic_data/output/swiggy_delivery_with_extra_features.csv"
OUTPUT_DIR = "/Users/prajitbaskaran/Downloads/synthetic_data/output"
OUTPUT_PATH = os.path.join(OUTPUT_DIR, "swiggy_delivery_reordered.csv")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================
# LOAD DATASET
# ============================
df = pd.read_csv(INPUT_PATH)

LABEL_COL = "delivery_fee"

# ============================
# REORDER COLUMNS
# ============================
feature_cols = [col for col in df.columns if col != LABEL_COL]
df = df[feature_cols + [LABEL_COL]]

# ============================
# SAVE REORDERED DATASET
# ============================
df.to_csv(OUTPUT_PATH, index=False)

print("‚úÖ Columns reordered successfully")
print("üì¶ Label column moved to the end")
print("üìÑ Saved at:", OUTPUT_PATH)

print("\nüßæ Final column order:")
print(df.columns.tolist())


‚úÖ Columns reordered successfully
üì¶ Label column moved to the end
üìÑ Saved at: /Users/prajitbaskaran/Downloads/synthetic_data/output/swiggy_delivery_reordered.csv

üßæ Final column order:
['distance_km', 'time_min', 'active_orders', 'available_partners', 'base_fee', 'promo_discount', 'noise_feature_1', 'noise_feature_2', 'area_code_id', 'app_version', 'is_weekend', 'is_raining', 'restaurant_rating', 'delivery_partner_rating', 'screen_time_sec', 'ui_clicks', 'delivery_fee']


In [11]:
import pandas as pd
import numpy as np
import shap
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


INPUT_PATH = "/Users/prajitbaskaran/Downloads/synthetic_data/output/swiggy_delivery_reordered.csv"
OUTPUT_DIR = "/Users/prajitbaskaran/Downloads/synthetic_data/output"

SHAP_RANK_PATH = os.path.join(OUTPUT_DIR, "shap_feature_ranking.csv")
PERF_PATH = os.path.join(OUTPUT_DIR, "rf_performance_by_features.csv")
TOP6_DATASET_PATH = os.path.join(OUTPUT_DIR, "top6_features_dataset.csv")

os.makedirs(OUTPUT_DIR, exist_ok=True)


df = pd.read_csv(INPUT_PATH)

LABEL_COL = "delivery_fee"
X = df.drop(columns=[LABEL_COL])
y = df[LABEL_COL]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)


explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_train)

# Mean absolute SHAP per feature
mean_abs_shap = np.abs(shap_values).mean(axis=0)

shap_importance = pd.DataFrame({
    "feature": X.columns,
    "mean_abs_shap": mean_abs_shap
}).sort_values(by="mean_abs_shap", ascending=False)

# Save SHAP ranking
shap_importance.to_csv(SHAP_RANK_PATH, index=False)

print("‚úÖ SHAP feature ranking computed")
print("üìÑ Saved at:", SHAP_RANK_PATH)


results = []
top_features = shap_importance["feature"].tolist()

for k in range(1, 7):  # Top 1 ‚Üí Top 6
    selected_features = top_features[:k]

    X_train_k = X_train[selected_features]
    X_test_k = X_test[selected_features]

    rf_k = RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    )
    rf_k.fit(X_train_k, y_train)

    y_pred = rf_k.predict(X_test_k)
    r2 = r2_score(y_test, y_pred)

    results.append({
        "num_features": k,
        "features_used": selected_features,
        "r2_score": round(r2, 4)
    })

    print(f"üìà Top-{k} features RF R¬≤: {r2:.4f}")

# Save performance results
perf_df = pd.DataFrame(results)
perf_df.to_csv(PERF_PATH, index=False)

print("\n‚úÖ RF performance tracking saved")
print("üìÑ Saved at:", PERF_PATH)


final_features = top_features[:6]
final_df = df[final_features + [LABEL_COL]]

final_df.to_csv(TOP6_DATASET_PATH, index=False)

print("\n‚úÖ Top-6 feature dataset saved")
print("üì¶ File:", TOP6_DATASET_PATH)

print("\nüèÅ FINAL SELECTED FEATURES:")
print(final_features)


‚úÖ SHAP feature ranking computed
üìÑ Saved at: /Users/prajitbaskaran/Downloads/synthetic_data/output/shap_feature_ranking.csv
üìà Top-1 features RF R¬≤: 0.3399
üìà Top-2 features RF R¬≤: 0.4883
üìà Top-3 features RF R¬≤: 0.7133
üìà Top-4 features RF R¬≤: 0.8292
üìà Top-5 features RF R¬≤: 0.9303
üìà Top-6 features RF R¬≤: 0.9548

‚úÖ RF performance tracking saved
üìÑ Saved at: /Users/prajitbaskaran/Downloads/synthetic_data/output/rf_performance_by_features.csv

‚úÖ Top-6 feature dataset saved
üì¶ File: /Users/prajitbaskaran/Downloads/synthetic_data/output/top6_features_dataset.csv

üèÅ FINAL SELECTED FEATURES:
['distance_km', 'promo_discount', 'time_min', 'active_orders', 'available_partners', 'base_fee']
