
# 🚚 Supply Chain ML (Colab Ready) — v3
Correcciones:
- **RMSE** calculado como `np.sqrt(mean_squared_error(...))` (evita incompatibilidades con el parámetro `squared`).
- **OneHotEncoder** compatible con todas las versiones de scikit-learn (`sparse_output` → *fallback* a `sparse`).

Incluye dos tareas sobre el dataset `dynamic_supply_chain_logistics_dataset.csv`:
- **Clasificación**: `risk_classification`
- **Regresión**: `delivery_time_deviation`


In [None]:

# @title 🔧 Instalar dependencias (si usas Colab)
# Make sure the modelling stack is importable, installing it on demand.
# The imports are probed first; the install step runs only when one of them raises.
try:
    import pandas as pd, numpy as np, lightgbm as lgb
    import matplotlib.pyplot as plt, sklearn
except Exception:
    # IPython shell escape (not plain Python): quiet pip install of the stack.
    !pip -q install pandas numpy scikit-learn lightgbm matplotlib pyarrow
# Import again unconditionally so the aliases are bound even after a fresh install.
import pandas as pd, numpy as np, lightgbm as lgb, matplotlib.pyplot as plt, sklearn
from IPython.display import display
# Print the library versions in use.
print("pandas:", pd.__version__, "| sklearn:", sklearn.__version__)


In [None]:

# @title ⚙️ Localización de datos y salidas
# Locate the input CSV and prepare the directory that receives every export.
import os, warnings, pandas as pd

# Suppresses all warnings for the rest of the session
# (presumably to keep the notebook output readable).
warnings.filterwarnings("ignore")

# Candidate locations, probed in order; the first path that exists is used.
CANDIDATES = [
    "/content/dynamic_supply_chain_logistics_dataset.csv",
    "/content/sample_data/dynamic_supply_chain_logistics_dataset.csv",
    "/content/drive/MyDrive/dynamic_supply_chain_logistics_dataset.csv",
    "dynamic_supply_chain_logistics_dataset.csv",
]
DATA_PATH = next((p for p in CANDIDATES if os.path.exists(p)), None)
# Raise explicitly rather than `assert`: assertions are removed under `python -O`,
# which would let a missing dataset slip through to a confusing failure later.
if DATA_PATH is None:
    raise FileNotFoundError("❌ Sube dynamic_supply_chain_logistics_dataset.csv a /content o sample_data.")
print("✅ Usando:", DATA_PATH)

# CANDIDATES also accepts a CSV in the working directory, so the notebook can run
# outside Colab; there /content does not exist, so write to a relative folder instead.
OUT_DIR = "/content/outputs" if os.path.isdir("/content") else "outputs"
os.makedirs(OUT_DIR, exist_ok=True)


In [None]:

# @title 📥 Carga + vista rápida + diccionario
# Load the dataset, preview it, and export a data dictionary plus a daily calendar.
df = pd.read_csv(DATA_PATH)
# Normalise headers: trimmed, lower-case, with spaces, slashes and dashes as underscores.
df.columns = [c.strip().lower().replace(" ", "_").replace("/", "_").replace("-", "_") for c in df.columns]
# Parse the timestamp only AFTER normalising the headers. Passing
# parse_dates=["timestamp"] to read_csv fails when the raw header is spelled
# differently (e.g. "Timestamp") or the column is absent altogether.
if "timestamp" in df.columns:
    df["timestamp"] = pd.to_datetime(df["timestamp"])
print(df.shape)
display(df.head())

# Data dictionary: one row per column with dtype, null count and cardinality.
dd = pd.DataFrame({
    "column": df.columns,
    "dtype": [str(df[c].dtype) for c in df.columns],
    "nulls": [int(df[c].isna().sum()) for c in df.columns],
    "unique": [int(df[c].nunique()) for c in df.columns],
})
dd.to_csv(f"{OUT_DIR}/feature_schema.csv", index=False)
display(dd.head(30))

# Daily calendar, useful for dashboards.
if "timestamp" in df.columns:
    # Output column -> source column whose daily mean it holds.
    mean_sources = {
        "avg_delay": "delivery_time_deviation",
        "avg_delay_probability": "delay_probability",
    }
    agg_spec = {"orders_count": ("timestamp", "count")}
    agg_spec.update(
        {out: (src, "mean") for out, src in mean_sources.items() if src in df.columns}
    )
    cal = (df.assign(date=df["timestamp"].dt.date)
             .groupby("date", as_index=False)
             .agg(**agg_spec))
    # A missing source column yields NaN instead of silently reusing the order
    # count under an "avg_*" name; the exported column layout stays the same.
    for out in mean_sources:
        if out not in cal.columns:
            cal[out] = float("nan")
    cal = cal[["date", "orders_count", *mean_sources]]
    cal["date"] = pd.to_datetime(cal["date"])
    cal["dow"] = cal["date"].dt.weekday
    cal["month"] = cal["date"].dt.month
    cal["year"] = cal["date"].dt.year
    # weekday() is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    cal["is_weekend"] = (cal["dow"] >= 5).astype(int)
    cal.to_csv(f"{OUT_DIR}/logistics_calendar.csv", index=False)
    display(cal.head())


In [None]:

# @title 🧩 Preprocesamiento (OneHotEncoder compatible)
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from pandas.api.types import is_numeric_dtype

def split_features(df, drop_cols=None):
    """Build the feature frame and classify its columns as numeric or categorical.

    Args:
        df: Source DataFrame. It is not modified; a copy is returned.
        drop_cols: Column label(s) to exclude from the features (typically the
            target). Labels that are absent are ignored. Defaults to none.

    Returns:
        A tuple ``(X, num_cols, cat_cols)``:
        ``X`` is the copied frame without ``drop_cols``; when ``df`` has a
        ``timestamp`` column, ``ts_year``, ``ts_month``, ``ts_dow`` and
        ``ts_hour`` are appended to it. ``num_cols`` lists the numeric columns
        (plus the derived ``ts_*`` ones) and ``cat_cols`` the remaining ones.
        The raw ``timestamp`` column stays in ``X`` but appears in neither list.
    """
    # None instead of a mutable `[]` default, which would be shared across calls.
    drop_cols = [] if drop_cols is None else drop_cols
    X = df.drop(columns=drop_cols, errors="ignore").copy()

    candidates = [c for c in X.columns if c != "timestamp"]
    num_cols = [c for c in candidates if is_numeric_dtype(X[c])]
    numeric = set(num_cols)  # O(1) membership while collecting the rest
    cat_cols = [c for c in candidates if c not in numeric]

    if "timestamp" in df.columns:
        # Expose the calendar parts of the timestamp as plain numeric features.
        ts = pd.to_datetime(df["timestamp"])
        X["ts_year"] = ts.dt.year
        X["ts_month"] = ts.dt.month
        X["ts_dow"] = ts.dt.weekday
        X["ts_hour"] = ts.dt.hour
        num_cols += ["ts_year", "ts_month", "ts_dow", "ts_hour"]
    return X, num_cols, cat_cols

def make_preprocessor(num_cols, cat_cols):
    """Assemble the ColumnTransformer shared by both models.

    Args:
        num_cols: Columns routed to ``StandardScaler(with_mean=False)``,
            i.e. scaled without being centred.
        cat_cols: Columns routed to a sparse one-hot encoder that ignores
            categories unseen during fitting.

    Returns:
        An unfitted ``ColumnTransformer``; columns in neither list are dropped.
    """
    common = {"handle_unknown": "ignore"}
    try:
        # Keyword accepted by newer scikit-learn releases.
        encoder = OneHotEncoder(sparse_output=True, **common)
    except TypeError:
        # Releases that reject `sparse_output` take `sparse` instead.
        encoder = OneHotEncoder(sparse=True, **common)

    scaler = StandardScaler(with_mean=False)
    steps = [
        ("num", scaler, num_cols),
        ("cat", encoder, cat_cols),
    ]
    return ColumnTransformer(transformers=steps, remainder="drop")


## 🔴 Clasificación: `risk_classification`

In [None]:

# Train and evaluate a LightGBM classifier for `risk_classification`, exporting
# the report, the confusion matrix and the per-row predictions to OUT_DIR.
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder

if "risk_classification" in df.columns:
    y = df["risk_classification"].astype(str)
    # NOTE(review): every other column is used as a feature, including
    # delivery_time_deviation when present — confirm none of them leaks the label.
    X, num_cols, cat_cols = split_features(df, drop_cols=["risk_classification"])
    pre = make_preprocessor(num_cols, cat_cols)

    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    # Encoded ids of every class, used to pin the shape of the report and the
    # confusion matrix even when a rare class is missing from the test split.
    class_ids = list(range(len(le.classes_)))

    X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42, stratify=y_enc)

    clf = lgb.LGBMClassifier(
        n_estimators=1200,
        learning_rate=0.05,
        subsample=0.8,
        # LightGBM applies `subsample` only when bagging is enabled through
        # subsample_freq > 0; with the default of 0 the 0.8 above is ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42
    )

    pipe_cls = Pipeline([("prep", pre), ("model", clf)])
    pipe_cls.fit(X_train, y_train)

    y_pred = pipe_cls.predict(X_test)
    y_proba = pipe_cls.predict_proba(X_test)

    report = classification_report(
        y_test, y_pred, labels=class_ids, target_names=le.classes_, output_dict=True
    )
    rep_df = pd.DataFrame(report).transpose()
    rep_df.to_csv(f"{OUT_DIR}/metrics_classification.csv")

    try:
        if y_proba.shape[1] == 2:
            # Binary target: roc_auc_score expects the positive-class scores,
            # not the full (n, 2) probability matrix.
            auc_ovr = roc_auc_score(y_test, y_proba[:, 1])
        else:
            auc_ovr = roc_auc_score(y_test, y_proba, multi_class="ovr")
    except Exception as exc:
        # Best effort: keep the cell running, but say why the AUC is missing.
        print("AUC OvR no disponible. Detalle:", exc)
        auc_ovr = np.nan

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
    cm_df.to_csv(f"{OUT_DIR}/confusion_matrix.csv")

    # Test rows with the true and predicted labels decoded back to their names.
    preds_df = X_test.copy()
    preds_df["y_true"] = le.inverse_transform(y_test)
    preds_df["y_pred"] = le.inverse_transform(y_pred)
    preds_df.to_csv(f"{OUT_DIR}/predictions_classification.csv", index=False)

    print("Accuracy:", report.get("accuracy"))
    print("Macro F1:", rep_df.loc["macro avg","f1-score"] if "macro avg" in rep_df.index else np.nan)
    print("AUC OvR:", auc_ovr)
    display(rep_df.head(10))
    display(cm_df)
else:
    print("Columna 'risk_classification' no existe. Se omite la sección de clasificación.")


## 🔵 Regresión: `delivery_time_deviation`

In [None]:

# Train and evaluate a LightGBM regressor for `delivery_time_deviation`,
# exporting MAE / RMSE / R2 and the per-row predictions to OUT_DIR.
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

if "delivery_time_deviation" in df.columns:
    # Rows without a target can be neither fitted nor scored (a NaN target makes
    # the fit and the metrics raise), so they are left out up front.
    labelled = df[df["delivery_time_deviation"].notna()]
    y = labelled["delivery_time_deviation"].astype(float)
    X, num_cols, cat_cols = split_features(labelled, drop_cols=["delivery_time_deviation"])
    pre = make_preprocessor(num_cols, cat_cols)

    reg = lgb.LGBMRegressor(
        n_estimators=1200,
        learning_rate=0.05,
        subsample=0.8,
        # LightGBM applies `subsample` only when bagging is enabled through
        # subsample_freq > 0; with the default of 0 the 0.8 above is ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42
    )

    pipe_reg = Pipeline([("prep", pre), ("model", reg)])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipe_reg.fit(X_train, y_train)
    y_hat = pipe_reg.predict(X_test)

    mae = mean_absolute_error(y_test, y_hat)
    # RMSE as the square root of the MSE, avoiding the `squared` keyword, which
    # is not accepted by every scikit-learn release.
    rmse = np.sqrt(mean_squared_error(y_test, y_hat))
    r2 = r2_score(y_test, y_hat)

    met_df = pd.DataFrame([{"MAE": mae, "RMSE": rmse, "R2": r2}])
    met_df.to_csv(f"{OUT_DIR}/metrics_regression.csv", index=False)
    print(met_df)

    # Test rows alongside the observed and predicted target values.
    preds_df = X_test.copy()
    preds_df["y_true"] = y_test.values
    preds_df["y_pred"] = y_hat
    preds_df.to_csv(f"{OUT_DIR}/predictions_regression.csv", index=False)
else:
    print("Columna 'delivery_time_deviation' no existe. Se omite la sección de regresión.")


In [None]:

# @title Importancias (perm. importance) sobre el modelo de regresión si está disponible
# Permutation importance of the regression pipeline, exported to OUT_DIR.
# Uses whatever `pipe_reg`, `X_test` and `y_test` the session currently holds.
from sklearn.inspection import permutation_importance

try:
    r = permutation_importance(pipe_reg, X_test, y_test, n_repeats=3, random_state=42, scoring="neg_mean_squared_error")
    # The whole pipeline is scored, so the columns being permuted are the raw
    # input columns of X_test: one importance per input column, not one per
    # transformed / one-hot feature. Labelling with the transformer's output
    # names gives a different length and would misattribute the scores.
    names = list(X_test.columns)
    imp_df = pd.DataFrame({"feature": names, "importance": r.importances_mean}).sort_values("importance", ascending=False)
    imp_df.to_csv(f"{OUT_DIR}/feature_importance.csv", index=False)
    display(imp_df.head(20))
except Exception as e:
    # Deliberately best effort: this cell must not abort the notebook.
    print("No se pudieron calcular importancias (quizá no se entrenó la regresión). Detalle:", e)
