In [None]:


import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib
from math import sqrt
# ---- USER SETTINGS ----
# Everything a reader may want to tune: input locations, where the fitted
# pipeline is written, and the split / seed parameters.
DATA_DIR = "/content/synthetic_supply_chain_data"

# One CSV per table, each located directly under DATA_DIR.
SHIPMENTS_CSV, LANES_CSV, WAREHOUSES_CSV, TRANSPORTS_CSV = (
    os.path.join(DATA_DIR, f"{table_name}.csv")
    for table_name in ("shipments", "lanes", "warehouses", "transports")
)

OUTPUT_MODEL_PATH = "/content/compat_model.joblib"  # target of joblib.dump
TEST_SIZE = 0.2    # fraction handed to train_test_split as test_size
RANDOM_SEED = 42   # random_state for the split and the regressor
# ------------------------

print("üöÄ Starting compatibility model training...\n")

# --------------------------
# Step 1: read the four raw tables
# --------------------------
print("üìÇ Loading datasets from:", DATA_DIR)
# Files are read in the same order as the names on the left-hand side.
shipments, lanes, warehouses, transports = (
    pd.read_csv(csv_path)
    for csv_path in (SHIPMENTS_CSV, LANES_CSV, WAREHOUSES_CSV, TRANSPORTS_CSV)
)

# Row-count report, one line per table. The trailing "\n" on the last entry
# leaves a blank line before whatever is printed next.
row_report = [
    f"‚úÖ Shipments: {len(shipments)} rows",
    f"‚úÖ Lanes: {len(lanes)} rows",
    f"‚úÖ Warehouses: {len(warehouses)} rows",
    f"‚úÖ Transports: {len(transports)} rows\n",
]
print("\n".join(row_report))

# --------------------------
# Step 2: prefix lookup-table columns before merging
# --------------------------
# The columns renamed here receive a per-table prefix (lane_, wh_, tr_)
# so their origin stays visible once everything is joined into one frame.
lanes = lanes.rename(columns=dict(
    distance_km="lane_distance_km",
    delay_rate="lane_delay_rate",
    avg_lead_time_days="lane_lead_time_days",
))
warehouses = warehouses.rename(columns=dict(
    avg_procurement_cost_per_sku="wh_procurement_cost",
    service_score="wh_service_score",
))
transports = transports.rename(columns=dict(
    base_cost_per_km="tr_base_cost_per_km",
    reliability="tr_reliability",
    co2_kg_per_km="tr_co2_kg_per_km",
))

# --------------------------
# Step 3: join shipments with lane, warehouse and transport attributes
# --------------------------
# Left joins keep every shipment row. validate="many_to_one" makes pandas
# raise MergeError when a lookup table has repeated join keys; without it
# such repeats would silently duplicate shipment rows and skew training.
df = shipments.merge(
    lanes,
    how="left",
    on=["warehouse_id", "store_id", "transport_id"],
    validate="many_to_one",
)
df = df.merge(warehouses, how="left", on="warehouse_id", validate="many_to_one")
df = df.merge(transports, how="left", on="transport_id", validate="many_to_one")

# Defensive: if any column label is repeated, keep only its first occurrence.
if df.columns.duplicated().any():
    print("‚ö†Ô∏è Warning: Duplicate columns still exist, removing duplicates...")
    df = df.loc[:, ~df.columns.duplicated()]

print(f"üßæ Final columns ({len(df.columns)}): {list(df.columns)}\n")

# --------------------------
# Step 4: feature setup
# --------------------------
# Shorter working names for the quantity feature and the regression target.
df = df.rename(columns={
    "quantity_units": "qty",
    "unit_landed_cost": "target_unit_cost"
})

feature_columns_numeric = [
    "qty", "lane_distance_km", "lane_delay_rate", "lane_lead_time_days",
    "wh_procurement_cost", "wh_service_score",
    "tr_base_cost_per_km", "tr_reliability", "tr_co2_kg_per_km"
]
cat_columns = ["warehouse_id", "store_id", "transport_id", "sku"]

# A missing categorical column has no sensible default. Fail here with a
# clear message; selecting the feature columns later would raise KeyError anyway.
missing_categorical = [c for c in cat_columns if c not in df.columns]
if missing_categorical:
    raise KeyError(f"Missing required categorical columns: {missing_categorical}")

# Best-effort fallback: a numeric feature absent from the merged frame is
# created as a constant 0.0 so training can still run. Report it, because a
# constant column carries no signal and usually points at a schema mismatch.
missing_numeric = [c for c in feature_columns_numeric if c not in df.columns]
if missing_numeric:
    print(f"Warning: numeric feature columns not found, filled with 0.0: {missing_numeric}")
for c in missing_numeric:
    df[c] = 0.0

# Rows without a target value cannot be used for supervised training.
df = df.dropna(subset=["target_unit_cost"]).reset_index(drop=True)

print(f"üß† Features: {len(feature_columns_numeric)} numeric, {len(cat_columns)} categorical")
print(f"üéØ Target: 'target_unit_cost' ({len(df)} samples)\n")

# --------------------------
# Step 5: hold out a test set
# --------------------------
# Model inputs are the numeric columns followed by the categorical ones;
# the target is extracted as an array via .values.
model_input_columns = [*feature_columns_numeric, *cat_columns]
X = df[model_input_columns]
y = df["target_unit_cost"].values

# Fixed random_state so the same rows land in train/test on every run.
split_options = {"test_size": TEST_SIZE, "random_state": RANDOM_SEED}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"üìä Train: {len(X_train)}, Test: {len(X_test)}\n")

# --------------------------
# Step 6: preprocessing
# --------------------------
# Numeric columns are standardised; categorical columns are one-hot encoded
# into a dense array, with handle_unknown="ignore" set on the encoder.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Each entry is (name, transformer, columns). The "cat" name is looked up
# again later when feature importances are labelled.
column_steps = [
    ("num", numeric_transformer, feature_columns_numeric),
    ("cat", categorical_transformer, cat_columns),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# --------------------------
# Step 7: model pipeline
# --------------------------
# Hyper-parameters are gathered in one mapping so they are easy to scan.
gbr_params = dict(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    random_state=RANDOM_SEED,
)
gbr = GradientBoostingRegressor(**gbr_params)

# Preprocessing and regressor are fitted together as one estimator, which
# is also the object that gets persisted later.
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", gbr)])

print("üèóÔ∏è Training compatibility model... (this may take ~20‚Äì30s)\n")
pipeline.fit(X_train, y_train)

# --------------------------
# Step 8: evaluate on the held-out split
# --------------------------
y_pred = pipeline.predict(X_test)
# RMSE is the square root of the mean squared error between the held-out
# targets and the predictions.
held_out_mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(held_out_mse)
print(f"‚úÖ Validation RMSE: {rmse:.3f}\n")

# --------------------------
# Step 9: feature importances
# --------------------------
# Best-effort reporting: any failure here is printed and the run continues.
try:
    model = pipeline.named_steps["model"]
    pre = pipeline.named_steps["preprocessor"]
    num_feats = feature_columns_numeric
    ohe = pre.named_transformers_["cat"]
    cat_feat_names = list(ohe.get_feature_names_out(cat_columns))
    # Label order mirrors the transformer order: numeric block first,
    # then the expanded one-hot columns.
    feature_names = [*num_feats, *cat_feat_names]
    importances = model.feature_importances_
    imp_df = (
        pd.DataFrame({"feature": feature_names, "importance": importances})
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )
    print("üìà Top 15 feature importances:")
    print(imp_df.head(15), "\n")
except Exception as err:
    print("‚ö†Ô∏è Could not compute feature importances:", err)

# --------------------------
# Step 10: persist the fitted pipeline
# --------------------------
# The whole pipeline (preprocessor + regressor) is written as one artifact.
joblib.dump(pipeline, filename=OUTPUT_MODEL_PATH)
print(f"üíæ Saved trained compatibility pipeline to: {OUTPUT_MODEL_PATH}\n")

# Sanity check: push one held-out row back through the fitted pipeline.
# The row index comes from a Generator seeded with RANDOM_SEED instead of the
# global NumPy RNG state, so a fresh Restart-and-Run-All prints the same sample.
sample_rng = np.random.default_rng(RANDOM_SEED)
sample_idx = int(sample_rng.integers(0, len(X_test)))  # upper bound is exclusive
sample = X_test.iloc[[sample_idx]]  # double brackets keep a one-row DataFrame
pred = pipeline.predict(sample)[0]
print("üîç Sample prediction check:")
print(sample)
print(f"Predicted unit cost: {pred:.2f}\n")

print("üéâ Compatibility model training completed successfully!")


üöÄ Starting compatibility model training...

üìÇ Loading datasets from: /content/synthetic_supply_chain_data
‚úÖ Shipments: 2500 rows
‚úÖ Lanes: 240 rows
‚úÖ Warehouses: 6 rows
‚úÖ Transports: 4 rows

üßæ Final columns (22): ['sku', 'warehouse_id', 'store_id', 'transport_id', 'quantity_units', 'unit_landed_cost', 'lead_time_days', 'delayed', 'co2_emission_kg', 'lane_distance_km', 'lane_delay_rate', 'lane_lead_time_days', 'region', 'capacity_units', 'operating_cost_per_day', 'wh_procurement_cost', 'wh_service_score', 'mode', 'tr_base_cost_per_km', 'avg_speed_kmph', 'tr_reliability', 'tr_co2_kg_per_km']

üß† Features: 9 numeric, 4 categorical
üéØ Target: 'target_unit_cost' (2500 samples)

üìä Train: 2000, Test: 500

üèóÔ∏è Training compatibility model... (this may take ~20‚Äì30s)

‚úÖ Validation RMSE: 2.946

üìà Top 15 feature importances:
                feature  importance
0   wh_procurement_cost    0.356587
1                   qty    0.288769
2      lane_distance_km    0.0924