In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib
from math import sqrt

# ============================================================
# 1. SET ROBUST PATHS
# ============================================================

# Directory the process was started in; passed to find_backend_dir() below
# as the starting point of the upward search.
CWD = os.getcwd()

def find_backend_dir(start_path):
    """Locate the backend directory by walking upward from ``start_path``.

    At every level two layouts are accepted:

    * the directory itself contains both a ``data`` and a ``models`` folder;
    * the directory contains a ``backend`` folder which in turn contains
      both ``data`` and ``models``.

    Args:
        start_path: Directory to start searching from. Relative paths are
            resolved against the current working directory first.

    Returns:
        The absolute path of the first directory that matches, or ``None``
        when the filesystem root is reached without a match.
    """
    def has_data_and_models(directory):
        """Return True when ``directory`` holds both required sub-folders."""
        return all(
            os.path.isdir(os.path.join(directory, name))
            for name in ("data", "models")
        )

    # os.path.dirname() of a relative path bottoms out at "" rather than at
    # the filesystem root, which would end the search at the current working
    # directory. Normalising to an absolute path lets the walk climb past it.
    path = os.path.abspath(start_path)
    while True:
        # Case 1: we are already inside the backend folder.
        if has_data_and_models(path):
            return path

        # Case 2: we are one level above the backend folder.
        backend_path = os.path.join(path, "backend")
        if has_data_and_models(backend_path):
            return backend_path

        parent = os.path.dirname(path)
        if parent == path:
            # dirname() is a fixed point at the root: nothing left to search.
            return None
        path = parent

BASE_DIR = find_backend_dir(CWD)

# Fail fast when no backend folder can be located; everything below relies
# on BASE_DIR being a real directory.
if BASE_DIR is None:
    print(f"‚ùå ERROR: Could not find 'data' or 'models' folders in or above {CWD}")
    raise FileNotFoundError("Could not auto-locate data/models directories.")

# Make the backend folder the working directory for the rest of the run.
print(f"‚úÖ Base directory set to: {BASE_DIR}")
os.chdir(BASE_DIR)
print(f"üìÅ Working directory changed to: {os.getcwd()}")

# ============================================================
# 2. USER SETTINGS
# ============================================================

# Input and output folders sit directly under the backend directory.
DATA_DIR, MODELS_DIR = (
    os.path.join(BASE_DIR, folder) for folder in ("data", "models")
)
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

# One CSV per input table, all inside DATA_DIR.
SHIPMENTS_CSV, LANES_CSV, WAREHOUSES_CSV, TRANSPORTS_CSV = (
    os.path.join(DATA_DIR, filename)
    for filename in ("shipments.csv", "lanes.csv", "warehouses.csv", "transports.csv")
)

# Where the fitted pipeline is written.
OUTPUT_MODEL_PATH = os.path.join(MODELS_DIR, "compat_model.joblib")

# Seed shared by the split and the model; fraction of rows held out for testing.
RANDOM_SEED, TEST_SIZE = 42, 0.2

# ============================================================
# 3. GENERATE SYNTHETIC DATA (if missing or empty)
# ============================================================

def generate_mock_data():
    """Create synthetic warehouse, transport, lane and shipment tables on disk.

    Seeds NumPy's global random generator with 42, builds four DataFrames and
    writes them to WAREHOUSES_CSV, TRANSPORTS_CSV, LANES_CSV and SHIPMENTS_CSV
    (in that order). Progress messages are printed to stdout. Returns nothing.
    """
    np.random.seed(42)

    print("üß™ Generating synthetic mock data...")

    # --- Warehouses: five sites, each given a distinct city -------------
    cities = ["Mumbai", "Delhi", "Chennai", "Bangalore", "Hyderabad"]
    warehouse_ids = [f"W{n}" for n in range(1, 6)]
    warehouse_cities = np.random.choice(cities, 5, replace=False)
    procurement_costs = np.random.uniform(50, 200, 5).round(2)
    service_scores = np.random.uniform(0.7, 1.0, 5).round(2)
    warehouses = pd.DataFrame({
        "warehouse_id": warehouse_ids,
        "location": warehouse_cities,
        "avg_procurement_cost_per_sku": procurement_costs,
        "service_score": service_scores,
    })

    # --- Transports: one row per mode -----------------------------------
    transport_ids = [f"T{n}" for n in range(1, 4)]
    cost_per_km = np.random.uniform(1.5, 6.0, 3).round(2)
    reliabilities = np.random.uniform(0.8, 1.0, 3).round(2)
    co2_per_km = np.random.uniform(0.1, 1.0, 3).round(2)
    transports = pd.DataFrame({
        "transport_id": transport_ids,
        "mode": ["Road", "Rail", "Air"],
        "base_cost_per_km": cost_per_km,
        "reliability": reliabilities,
        "co2_kg_per_km": co2_per_km,
    })

    stores = [f"S{n}" for n in range(1, 11)]

    # --- Lanes: every warehouse x store x transport combination, so each
    # shipment generated below is guaranteed to have a matching lane. ------
    lanes = pd.DataFrame([
        {
            "warehouse_id": wh_id,
            "store_id": store_id,
            "transport_id": tr_id,
            "distance_km": np.random.uniform(100, 1500),
            "delay_rate": np.random.uniform(0.01, 0.15),
            "avg_lead_time_days": np.random.uniform(1, 10),
        }
        for wh_id in warehouses["warehouse_id"]
        for store_id in stores
        for tr_id in transports["transport_id"]
    ])

    skus = [f"SKU{n}" for n in range(1, 21)]

    def draw_shipment():
        """Draw one random shipment record priced from its lane distance."""
        wh = np.random.choice(warehouses["warehouse_id"])
        st = np.random.choice(stores)
        tr = np.random.choice(transports["transport_id"])
        sku = np.random.choice(skus)
        qty = np.random.randint(10, 500)
        base_cost = np.random.uniform(5, 50)
        noise = np.random.uniform(-2, 5)

        # A matching lane always exists because lanes covers all combinations.
        on_lane = (
            (lanes["warehouse_id"] == wh)
            & (lanes["store_id"] == st)
            & (lanes["transport_id"] == tr)
        )
        distance = lanes.loc[on_lane, "distance_km"].iloc[0]

        unit_landed_cost = base_cost + 0.02 * distance + noise
        return {
            "warehouse_id": wh,
            "store_id": st,
            "transport_id": tr,
            "sku": sku,
            "quantity_units": qty,
            "unit_landed_cost": round(unit_landed_cost, 2),
        }

    # 500 shipment records.
    shipments = pd.DataFrame([draw_shipment() for _ in range(500)])

    for frame, destination in (
        (warehouses, WAREHOUSES_CSV),
        (transports, TRANSPORTS_CSV),
        (lanes, LANES_CSV),
        (shipments, SHIPMENTS_CSV),
    ):
        frame.to_csv(destination, index=False)

    print(f"‚úÖ Synthetic datasets created at {DATA_DIR}")
    print(f" Warehouses: {warehouses.shape}, Transports: {transports.shape}, Lanes: {lanes.shape}, Shipments: {shipments.shape}\n")

# Only proceed straight to loading when shipments.csv exists and is non-empty;
# otherwise (re)generate the synthetic tables first.
if not (os.path.exists(SHIPMENTS_CSV) and os.path.getsize(SHIPMENTS_CSV) > 0):
    generate_mock_data()

# ============================================================
# 4. LOAD DATA
# ============================================================

print("üìÇ Loading datasets from:", DATA_DIR)
shipments, lanes, warehouses, transports = (
    pd.read_csv(csv_path)
    for csv_path in (SHIPMENTS_CSV, LANES_CSV, WAREHOUSES_CSV, TRANSPORTS_CSV)
)

# Row counts of each table, one per line, followed by a blank line.
print(
    f"‚úÖ Shipments: {len(shipments)} rows",
    f"‚úÖ Lanes: {len(lanes)} rows",
    f"‚úÖ Warehouses: {len(warehouses)} rows",
    f"‚úÖ Transports: {len(transports)} rows\n",
    sep="\n",
)

# ============================================================
# 5. RENAME + MERGE DATASETS
# ============================================================

# Prefix the attribute columns with their source table (lane_/wh_/tr_) so
# they stay distinguishable after the joins below.
lanes = lanes.rename(
    columns={
        "distance_km": "lane_distance_km",
        "delay_rate": "lane_delay_rate",
        "avg_lead_time_days": "lane_lead_time_days",
    }
)
warehouses = warehouses.rename(
    columns={
        "avg_procurement_cost_per_sku": "wh_procurement_cost",
        "service_score": "wh_service_score",
    }
)
transports = transports.rename(
    columns={
        "base_cost_per_km": "tr_base_cost_per_km",
        "reliability": "tr_reliability",
        "co2_kg_per_km": "tr_co2_kg_per_km",
    }
)

# Left-join lane, warehouse and transport attributes onto each shipment.
df = (
    shipments
    .merge(lanes, how="left", on=["warehouse_id", "store_id", "transport_id"])
    .merge(warehouses, how="left", on="warehouse_id")
    .merge(transports, how="left", on="transport_id")
)

# If any column label occurs more than once, keep only its first occurrence.
if not df.columns.is_unique:
    df = df.loc[:, ~df.columns.duplicated()]

# ============================================================
# 6. CLEAN AND PREPARE FEATURES
# ============================================================

df = df.rename(
    columns={"quantity_units": "qty", "unit_landed_cost": "target_unit_cost"}
)

# Numeric model inputs, grouped by the table they come from.
feature_columns_numeric = (
    ["qty"]
    + ["lane_distance_km", "lane_delay_rate", "lane_lead_time_days"]
    + ["wh_procurement_cost", "wh_service_score"]
    + ["tr_base_cost_per_km", "tr_reliability", "tr_co2_kg_per_km"]
)
cat_columns = ["warehouse_id", "store_id", "transport_id", "sku"]

# Safety step: replace NaNs in the joined attribute columns with fixed
# fallback values, and NaNs in the categorical columns with "UNKNOWN".
df = df.fillna(
    {
        "lane_distance_km": 500,
        "lane_delay_rate": 0.05,
        "lane_lead_time_days": 5,
        "wh_procurement_cost": 100,
        "wh_service_score": 0.85,
        "tr_base_cost_per_km": 3.0,
        "tr_reliability": 0.9,
        "tr_co2_kg_per_km": 0.5,
    }
)
df = df.assign(**{c: df[c].fillna("UNKNOWN") for c in cat_columns})

# Rows without a target cannot be used for training.
df = df.dropna(subset=["target_unit_cost"])
df = df.reset_index(drop=True)

print(f"üß† Features ready: {len(feature_columns_numeric)} numeric, {len(cat_columns)} categorical")
print(f"üéØ Samples for training: {len(df)}\n")

# ============================================================
# 7. TRAIN/TEST SPLIT + PIPELINE
# ============================================================

# Target vector and feature matrix (numeric columns first, then categoricals).
y = df["target_unit_cost"].values
X = df[feature_columns_numeric + cat_columns]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_SEED,
    test_size=TEST_SIZE,
)

# Estimator: gradient-boosted regression trees with a fixed seed.
gbr = GradientBoostingRegressor(
    random_state=RANDOM_SEED,
    learning_rate=0.05,
    max_depth=4,
    n_estimators=300,
)

# Preprocessing: scale the numeric columns, one-hot encode the categoricals
# (categories unseen during fit are ignored at predict time).
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, feature_columns_numeric),
        ("cat", categorical_transformer, cat_columns),
    ]
)

pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", gbr)])

print("üèóÔ∏è Training compatibility model...\n")
pipeline.fit(X_train, y_train)

# ============================================================
# 8. EVALUATE
# ============================================================

# Root-mean-squared error on the held-out rows.
y_pred = pipeline.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f"‚úÖ Validation RMSE: {rmse:.3f}\n")

# ============================================================
# 9. FEATURE IMPORTANCES
# ============================================================

# Best-effort report: any failure here is printed and the run continues.
try:
    model = pipeline.named_steps["model"]
    pre = pipeline.named_steps["preprocessor"]
    ohe = pre.named_transformers_["cat"]

    # Column names in the order the preprocessor emits them: numeric
    # columns first, then the one-hot encoded categorical columns.
    cat_feat_names = list(ohe.get_feature_names_out(cat_columns))
    feature_names = feature_columns_numeric + cat_feat_names

    importances = model.feature_importances_
    imp_df = (
        pd.DataFrame({"feature": feature_names, "importance": importances})
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )
    print("üìà Top 10 feature importances:")
    print(imp_df.head(10), "\n")
except Exception as err:
    print("‚ö†Ô∏è Could not compute feature importances:", err)

# ============================================================
# 10. SAVE MODEL + SAMPLE PREDICTION
# ============================================================

# Persist the whole fitted pipeline (preprocessing + model) in one file.
joblib.dump(pipeline, OUTPUT_MODEL_PATH)
print(f"üíæ Model saved to: {OUTPUT_MODEL_PATH}\n")

# Smoke test: predict one randomly chosen held-out row.
sample_idx = np.random.randint(0, len(X_test))
sample = X_test.iloc[sample_idx:sample_idx + 1]
(pred,) = pipeline.predict(sample)
print("üîç Sample prediction check:", sample, sep="\n")
print(f"Predicted unit cost: {pred:.2f}\n")

print("üéâ Compatibility model training completed successfully!")


‚úÖ Base directory set to: c:\Users\sujal\Projects\mini\backend
üìÅ Working directory changed to: c:\Users\sujal\Projects\mini\backend
üìÇ Loading datasets from: c:\Users\sujal\Projects\mini\backend\data
‚úÖ Shipments: 500 rows
‚úÖ Lanes: 50 rows
‚úÖ Warehouses: 5 rows
‚úÖ Transports: 3 rows

üß† Features ready: 9 numeric, 4 categorical
üéØ Samples for training: 500

üèóÔ∏è Training compatibility model...

‚úÖ Validation RMSE: 15.703

üìà Top 10 feature importances:
               feature  importance
0                  qty    0.229806
1     lane_distance_km    0.140111
2  lane_lead_time_days    0.063807
3  wh_procurement_cost    0.036882
4      warehouse_id_W2    0.032206
5          store_id_S2    0.031749
6             sku_SKU2    0.023243
7      lane_delay_rate    0.021071
8          store_id_S6    0.020344
9  tr_base_cost_per_km    0.019889 

üíæ Model saved to: c:\Users\sujal\Projects\mini\backend\models\compat_model.joblib

üîç Sample prediction check:
     qty  lane_dista