In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
import numpy as np
import shap

# =========================
# LOAD & PREP DATA
# =========================
# Read the pre-processed dataset from disk (path is relative to the notebook).
df = pd.read_csv("../data/processed/datos_procesados.csv")

# Columns kept out of the feature matrix: the target itself plus the other
# cost/amount columns listed below (presumably because they are derived from
# or closely tied to the target -- confirm with the data owner).
excluded_columns = [
    "Costo",
    "Monto Real",
    "Costo Prom",
    "CostoxTn",
    "Variacion",
    "Flete Falso (MXN)",
    "Flete Falso (USD)",
    "Monto Falso",
    "Shp.Cost",
    "Monto Reparto",
]

# Target vector and feature matrix.
y = df["Costo"]
X = df.drop(columns=excluded_columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature names are stored for reuse by the SHAP cell that follows.
feature_names = list(X_train.columns)
print(feature_names)


In [None]:

# =========================
# XGBoost Model
# =========================
# Hyper-parameters gathered in one place so they are easy to tune.
xgb_params = dict(
    n_estimators=10,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
)

model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# =========================
# SHAP EXPLAINER
# =========================
def _as_float_frame(frame):
    """Return `frame` cast to float64 as a new DataFrame.

    The result has `feature_names` as columns and a fresh default index,
    because the values are round-tripped through a NumPy array.
    """
    return pd.DataFrame(
        frame.astype("float64").to_numpy(),
        columns=feature_names,
    )


# Float64 copies of both splits, with the feature names attached.
X_train_named = _as_float_frame(X_train)
X_test_named = _as_float_frame(X_test)


def f(x):
    """Prediction wrapper handed to SHAP: forwards `x` to the fitted model."""
    return model.predict(x)


# Explainer built from the prediction callable, with the training frame as
# background data; explanations are computed for the test frame.
explainer = shap.Explainer(f, X_train_named)
shap_values = explainer(X_test_named)

# Summary plots of the SHAP values.
shap.plots.beeswarm(shap_values)
shap.plots.bar(shap_values)



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# -------------------------------
# 1. Select the clustering columns
# -------------------------------
cluster_cols = [
    "Peso Total (kg)",
    "Distancia_km",
    "Costo",
    "CostoxTn",
    "Flete Falso (MXN)",
    "Monto Real",
    "Variacion"
]

# -------------------------------
# 2. Impute NaNs with the column mean
# -------------------------------
imputer = SimpleImputer(strategy="mean")
X_cluster_raw = imputer.fit_transform(df[cluster_cols])

# -------------------------------
# 3. Standardize the data
# -------------------------------
scaler = StandardScaler()
X_cluster = scaler.fit_transform(X_cluster_raw)

# -------------------------------
# 4. ELBOW METHOD
# -------------------------------
# Fit K-Means for every candidate cluster count and record the inertia.
inertia = []
K_range = range(2, 12)

# The loop variable is deliberately NOT called `k`: the notebook uses `k` for
# the chosen number of clusters, and a leaked loop variable (left at 11 after
# this loop) would silently overwrite it whenever this cell is re-run.
for k_val in K_range:
    km = KMeans(n_clusters=k_val, random_state=42, n_init=10)
    km.fit(X_cluster)
    inertia.append(km.inertia_)

plt.plot(K_range, inertia, marker='o')
plt.xlabel("Número de clusters (k)")
plt.ylabel("Inercia")
plt.title("Método del Codo")
plt.grid(True)
plt.show()


In [None]:
# Final clustering with the number of clusters chosen from the elbow plot.
k = 4

# n_init is set explicitly so this fit uses the same restart policy as the
# elbow-method loop (n_init=10); the library default for n_init is not the
# same across scikit-learn versions.
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)

# Store the cluster label of every row on the DataFrame.
df["cluster"] = kmeans.fit_predict(X_cluster)

# Number of rows per cluster (displayed as the cell output).
df["cluster"].value_counts()


In [None]:
import pandas as pd
import numpy as np

# Centroids mapped back to the original scale: undo the standardization that
# was applied before clustering.
centroids_scaled = kmeans.cluster_centers_
centroids_original = scaler.inverse_transform(centroids_scaled)

# One row per centroid, one column per clustering feature.
centroides_df = pd.DataFrame(centroids_original, columns=cluster_cols)

# Row labels are derived from the number of centroids actually fitted rather
# than from the notebook-level `k`, which can be stale or overwritten when
# cells are re-run out of order.
centroides_df.index = [f"Cluster {i}" for i in range(len(centroids_original))]

centroides_df


In [None]:
import matplotlib.pyplot as plt

# Weight vs. cost scatter, one color per cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    df["Peso Total (kg)"],
    df["Costo"],
    c=df["cluster"],
    cmap="viridis",
)
ax.set_xlabel("Peso Total (kg)")
ax.set_ylabel("Costo")
ax.set_title("Clusters (K=4) - Peso vs Costo")
plt.show()


In [None]:
# Distance vs. cost scatter, one color per cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    df["Distancia_km"],
    df["Costo"],
    c=df["cluster"],
    cmap="viridis",
)
ax.set_xlabel("Distancia (km)")
ax.set_ylabel("Costo")
ax.set_title("Clusters (K=4) - Distancia vs Costo")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA # Nuevo Import

# Build a synthetic demo dataset (nothing is read from disk here).
# NOTE: this rebinds `df`, replacing whatever DataFrame an earlier cell stored
# under that name.
np.random.seed(42)
N = 100

# The columns are drawn one after another in a fixed order: with the global
# seed set above, the draw order determines the generated values.
data = {}
data["Peso Total (kg)"] = np.random.uniform(500, 5000, N)
data["Distancia_km"] = np.random.uniform(100, 1000, N)
data["Costo"] = np.random.uniform(1000, 15000, N)
data["CostoxTn"] = np.random.uniform(1, 10, N)
data["Flete Falso (MXN)"] = np.random.choice([0, 500, 1000], N, p=[0.8, 0.15, 0.05])
data["Monto Real"] = np.random.uniform(10000, 50000, N)
data["Variacion"] = np.random.normal(0, 5, N)
df = pd.DataFrame(data)

# Blank out a small fraction of two columns so the imputation step has
# missing values to fill.
for column, fraction in (("Peso Total (kg)", 0.05), ("Costo", 0.03)):
    rows_to_blank = df.sample(frac=fraction, random_state=42).index
    df.loc[rows_to_blank, column] = np.nan

# ==============================================================================
# 1. Columns that take part in the clustering
# ==============================================================================
cluster_cols = [
    "Peso Total (kg)",
    "Distancia_km",
    "Costo",
    "CostoxTn",
    "Flete Falso (MXN)",
    "Monto Real",
    "Variacion",
]

# ==============================================================================
# 2. Fill missing values with the mean of each column
# ==============================================================================
raw_values = df[cluster_cols].to_numpy()
imputer = SimpleImputer(strategy="mean")
X_cluster_raw = imputer.fit(raw_values).transform(raw_values)

# ==============================================================================
# 3. Standardize the imputed values
# ==============================================================================
scaler = StandardScaler()
X_cluster = scaler.fit(X_cluster_raw).transform(X_cluster_raw)

# ==============================================================================
# 4. Project onto the first two principal components
# ==============================================================================
n_components = 2
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_cluster)

# ==============================================================================
# 5. ELBOW METHOD (computed on the PCA components)
# ==============================================================================
K_range = range(2, 12)
inertia = []

for k_val in K_range:
    # Clustering runs in the PCA space, not on the standardized features.
    km = KMeans(n_clusters=k_val, random_state=42, n_init=10).fit(X_pca)
    inertia.append(km.inertia_)

# The curve is written to disk and the figure is closed afterwards.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K_range, inertia, marker='o')
ax.set_xlabel("Número de clusters (k)")
ax.set_ylabel("Inercia")
ax.set_title("Método del Codo (en el espacio PCA)")
ax.grid(True)
fig.savefig("elbow_method_pca.png")
plt.close(fig)

# ==============================================================================
# 6. Fit K-MEANS with k=4 and assign the cluster labels
# ==============================================================================
k = 4
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(X_pca)

# Labels are stored on the DataFrame, one per row.
df["cluster_pca"] = kmeans.labels_

# ==============================================================================
# 7. CENTROID ANALYSIS (inverse transformations)
# ==============================================================================
# Step 1: cluster centers as fitted, in PCA coordinates.
centroids_pca = kmeans.cluster_centers_

# Step 2: PCA coordinates -> standardized feature space.
centroids_scaled = pca.inverse_transform(centroids_pca)

# Step 3: standardized feature space -> original units.
centroids_original = scaler.inverse_transform(centroids_scaled)

# Centroid table: one labeled row per cluster, written to CSV with its index.
centroid_labels = [f"Cluster {i}" for i in range(k)]
centroides_df = pd.DataFrame(
    centroids_original,
    index=centroid_labels,
    columns=cluster_cols,
)
centroides_df.to_csv("centroides_pca_kmeans.csv", index=True)

# ==============================================================================
# 8. CLUSTER VISUALIZATION
# ==============================================================================

def _save_cluster_scatter(x, y, labels, xlabel, ylabel, title, filename):
    """Draw one scatter plot colored by cluster label and save it to `filename`.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points.
    labels : array-like
        Cluster label of each point; used as the color value.
    xlabel, ylabel, title : str
        Axis labels and figure title.
    filename : str
        Path of the image file to write. The figure is closed after saving.
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(x, y, c=labels, cmap="viridis")
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.colorbar(label='Cluster ID')
    plt.grid(True)
    plt.savefig(filename)
    plt.close()


# `df` still holds the NaNs in "Peso Total (kg)" and "Costo"; scattering those
# columns directly would silently leave the affected rows out of the figures
# even though they were clustered. The plots in original units therefore use
# the imputed values, i.e. exactly what the clustering pipeline was fed.
plot_values = pd.DataFrame(X_cluster_raw, columns=cluster_cols, index=df.index)
cluster_labels = df["cluster_pca"]

# Plot A: clusters in the PCA space (PC1 vs PC2)
_save_cluster_scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    cluster_labels,
    "Componente Principal 1",
    "Componente Principal 2",
    f"Clusters (K={k}) en el Espacio PCA",
    "pca_clusters.png",
)

# Plot B: clusters on original variables (weight vs cost)
_save_cluster_scatter(
    plot_values["Peso Total (kg)"],
    plot_values["Costo"],
    cluster_labels,
    "Peso Total (kg)",
    "Costo",
    f"Clusters (K={k}, PCA-KMeans) - Peso vs Costo",
    "peso_costo_clusters_pca.png",
)

# Plot C: clusters on original variables (distance vs cost)
_save_cluster_scatter(
    plot_values["Distancia_km"],
    plot_values["Costo"],
    cluster_labels,
    "Distancia (km)",
    "Costo",
    f"Clusters (K={k}, PCA-KMeans) - Distancia vs Costo",
    "distancia_costo_clusters_pca.png",
)

# Share of the total variance retained by the components kept by the PCA.
print(f"Varianza Explicada por los {n_components} componentes principales: {np.sum(pca.explained_variance_ratio_):.2f}")