In [1]:
# SOLUCIONES PARA OVERFITTING EXTREMO
# Ejecuta estas soluciones paso a paso

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Banner and data loading
# ---------------------------------------------------------------------------
print("SOLUCIONES PARA OVERFITTING EXTREMO")
print("=" * 50)

# Supplier dataset; the relative path is resolved against the working directory.
df = pd.read_csv('../data/dewatering_realistic_supplier_dataset.csv')

# ---------------------------------------------------------------------------
# SOLUTION 1: drop the features suspected of leaking the target
# ---------------------------------------------------------------------------
print("SOLUCIÓN 1: ELIMINAR FEATURES PROBLEMÁTICAS")
print("-" * 40)

# Columns excluded from training because they identify the supplier directly.
problematic_features = ['country', 'quality_rating']

# Remaining feature set, assembled group by group (final order is unchanged).
clean_features = (
    ['price_usd', 'delivery_days', 'payment_terms_days']
    + ['shipping_included', 'express_available']
    + ['order_urgency', 'quantity_needed', 'budget_available']
    + ['product_type', 'incoterms']
    + ['month', 'quarter']
)

print(f"Features eliminadas: {problematic_features}")
print(f"Features utilizadas: {clean_features}")

# Independent copies, so the encoding steps that follow never touch `df`.
X_clean = df.loc[:, clean_features].copy()
y = df.loc[:, 'supplier_name'].copy()

# Column groups that need conversion before they can be fed to the trees.
categorical_features = ['order_urgency', 'product_type', 'incoterms', 'quarter']
boolean_features = ['shipping_included', 'express_available']

# Categorical columns -> integer codes. One fitted encoder is kept per column
# (keyed by column name) because the encoders are persisted later on.
label_encoders = {}
for feature in categorical_features:
    if feature not in X_clean.columns:
        continue
    le = LabelEncoder()
    as_text = X_clean[feature].astype(str)  # encode every value as a string
    X_clean[feature] = le.fit_transform(as_text)
    label_encoders[feature] = le

# Boolean columns -> integers.
for feature in boolean_features:
    if feature not in X_clean.columns:
        continue
    X_clean[feature] = X_clean[feature].astype(int)

# Target: supplier names -> integer class ids.
target_encoder = LabelEncoder()
target_encoder.fit(y)
y_encoded = target_encoder.transform(y)

# ---------------------------------------------------------------------------
# SOLUTION 2: heavily regularised decision tree
# ---------------------------------------------------------------------------
print("\nSOLUCIÓN 2: PARÁMETROS ULTRA-CONSERVADORES")
print("-" * 40)

# 70/30 split, stratified on the encoded target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y_encoded,
    test_size=0.3,
    stratify=y_encoded,
    random_state=42,
)

# Hyper-parameters chosen to keep the tree small.
conservative_params = {
    'max_depth': 3,            # very shallow
    'min_samples_split': 50,   # a node needs at least 50 samples to be split
    'min_samples_leaf': 20,    # every leaf keeps at least 20 samples
    'max_features': 5,         # at most 5 features considered per split
    'random_state': 42,
}
conservative_model = DecisionTreeClassifier(**conservative_params)
conservative_model.fit(X_train, y_train)

# Hold-out evaluation.
y_pred_conservative = conservative_model.predict(X_test)
accuracy_conservative = accuracy_score(y_test, y_pred_conservative)

print(f"Accuracy modelo conservador: {accuracy_conservative:.3f}")
print(f"Profundidad: {conservative_model.get_depth()}")
print(f"Número de hojas: {conservative_model.get_n_leaves()}")

# ---------------------------------------------------------------------------
# SOLUTION 3: cross-validation as a stability check
# ---------------------------------------------------------------------------
print("\nSOLUCIÓN 3: VALIDACIÓN CRUZADA")
print("-" * 40)

# 5-fold accuracy of the conservative tree over the full encoded dataset.
cv_scores = cross_val_score(
    conservative_model,
    X_clean,
    y_encoded,
    cv=5,
    scoring='accuracy',
)
print(f"CV Scores: {cv_scores}")
print(f"CV Mean: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

# A fold-to-fold standard deviation above 0.1 is reported as instability.
folds_are_unstable = cv_scores.std() > 0.1
print(
    "⚠️  Alta variabilidad entre folds - modelo inestable"
    if folds_are_unstable
    else "✅ Variabilidad aceptable entre folds"
)

# ---------------------------------------------------------------------------
# SOLUTION 4: add Gaussian noise to the numeric features
# ---------------------------------------------------------------------------
print("\nSOLUCIÓN 4: AÑADIR RUIDO PARA REDUCIR OVERFITTING")
print("-" * 40)

# Work on a copy so X_clean stays noise-free for the other experiments.
X_noisy = X_clean.copy()

# Numeric columns that receive noise; its scale is a fraction of each
# column's own standard deviation.
numeric_features = ['price_usd', 'delivery_days', 'payment_terms_days', 'quantity_needed', 'budget_available']
noise_factor = 0.05  # noise std = 5% of the column std

# Seeded generator: every other random step in this script pins
# random_state=42, so the noise must be reproducible too. Otherwise the
# reported accuracy changes on every run.
noise_rng = np.random.default_rng(42)

for feature in numeric_features:
    if feature in X_noisy.columns:
        std_dev = X_noisy[feature].std()
        noise = noise_rng.normal(0, std_dev * noise_factor, len(X_noisy))
        X_noisy[feature] = X_noisy[feature] + noise

print("Ruido añadido a features numéricas")

# NOTE(review): the noise is added before the split, so the held-out rows are
# perturbed as well - confirm that evaluating on noisy data is intended.
X_train_noisy, X_test_noisy, y_train_noisy, y_test_noisy = train_test_split(
    X_noisy, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# Slightly less restrictive tree than the conservative model.
noisy_model = DecisionTreeClassifier(
    max_depth=4,
    min_samples_split=30,
    min_samples_leaf=15,
    random_state=42
)

noisy_model.fit(X_train_noisy, y_train_noisy)
y_pred_noisy = noisy_model.predict(X_test_noisy)
accuracy_noisy = accuracy_score(y_test_noisy, y_pred_noisy)

print(f"Accuracy modelo con ruido: {accuracy_noisy:.3f}")

# ---------------------------------------------------------------------------
# SOLUTION 5: small ensemble of simple trees
# ---------------------------------------------------------------------------
print("\nSOLUCIÓN 5: ENSEMBLE DE MODELOS SIMPLES")
print("-" * 40)

from sklearn.ensemble import RandomForestClassifier

# Random forest with the same restrictive limits as the conservative tree.
rf_params = {
    'n_estimators': 10,        # few trees
    'max_depth': 3,            # very shallow
    'min_samples_split': 50,   # restrictive
    'min_samples_leaf': 20,    # restrictive
    'max_features': 3,         # few features considered per split
    'random_state': 42,
}
rf_conservative = RandomForestClassifier(**rf_params)
rf_conservative.fit(X_train, y_train)

# Hold-out evaluation on the split created for the conservative tree.
y_pred_rf = rf_conservative.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"Accuracy Random Forest conservador: {accuracy_rf:.3f}")

# Importance table: one row per feature, most important first.
rf_importance = pd.DataFrame({
    'feature': X_clean.columns,
    'importance': rf_conservative.feature_importances_,
})
rf_importance = rf_importance.sort_values('importance', ascending=False)

print("\nImportancia de features (Random Forest):")
print(rf_importance.head(8))

# ---------------------------------------------------------------------------
# SOLUTION 6: temporal validation
# Train on the oldest 80% of rows (ordered by `date`), test on the newest 20%.
# ---------------------------------------------------------------------------
print("\nSOLUCIÓN 6: VALIDACIÓN TEMPORAL")
print("-" * 40)

# Sort chronologically on a copy so `df` keeps its original order and dtypes.
df_temporal = df.copy()
df_temporal['date'] = pd.to_datetime(df_temporal['date'])
df_temporal = df_temporal.sort_values('date')

# Positional split: first 80% of the sorted rows for training, rest for test.
split_idx = int(len(df_temporal) * 0.8)
train_temporal = df_temporal.iloc[:split_idx]
test_temporal = df_temporal.iloc[split_idx:]

print(f"Train temporal: {train_temporal['date'].min()} a {train_temporal['date'].max()}")
print(f"Test temporal: {test_temporal['date'].min()} a {test_temporal['date'].max()}")

# Feature/target frames for each side of the split.
X_train_temp = train_temporal[clean_features].copy()
X_test_temp = test_temporal[clean_features].copy()
y_train_temp = train_temporal['supplier_name'].copy()
y_test_temp = test_temporal['supplier_name'].copy()

# Code given to test-set categories that never occurred in the training window.
# It lies outside the 0..n-1 range LabelEncoder produces, so an unseen category
# is no longer silently aliased with whichever real category got code 0.
UNSEEN_CATEGORY_CODE = -1

# Categorical encoding: fit on the training window only, then apply to test.
for feature in categorical_features:
    if feature in X_train_temp.columns:
        le = LabelEncoder()
        X_train_temp[feature] = le.fit_transform(X_train_temp[feature].astype(str))
        # One dictionary lookup per row instead of one le.transform() call per row.
        category_to_code = {category: code for code, category in enumerate(le.classes_)}
        X_test_temp[feature] = (
            X_test_temp[feature]
            .astype(str)
            .map(category_to_code)          # unseen categories become NaN
            .fillna(UNSEEN_CATEGORY_CODE)
            .astype(int)
        )

for feature in boolean_features:
    if feature in X_train_temp.columns:
        X_train_temp[feature] = X_train_temp[feature].astype(int)
        X_test_temp[feature] = X_test_temp[feature].astype(int)

# Target encoding uses a dedicated encoder fitted on every supplier label.
# Refitting the shared `target_encoder` here would (a) overwrite the encoder
# that is persisted next to the final model and (b) make transform() raise
# when a supplier only appears in the test window.
temporal_target_encoder = LabelEncoder().fit(df_temporal['supplier_name'])
y_train_temp_encoded = temporal_target_encoder.transform(y_train_temp)
y_test_temp_encoded = temporal_target_encoder.transform(y_test_temp)

# Restricted tree trained on the older window.
temporal_model = DecisionTreeClassifier(
    max_depth=4,
    min_samples_split=40,
    min_samples_leaf=15,
    random_state=42
)

temporal_model.fit(X_train_temp, y_train_temp_encoded)
y_pred_temporal = temporal_model.predict(X_test_temp)
accuracy_temporal = accuracy_score(y_test_temp_encoded, y_pred_temporal)

print(f"Accuracy validación temporal: {accuracy_temporal:.3f}")

# ---------------------------------------------------------------------------
# Summary of results
# ---------------------------------------------------------------------------
print("\n" + "=" * 50)
print("RESUMEN DE SOLUCIONES Y RESULTADOS:")
print("=" * 50)

# (label, hold-out accuracy) for each experiment run so far.
results = [
    ("Modelo conservador (sin country/quality)", accuracy_conservative),
    ("Modelo con ruido", accuracy_noisy),
    ("Random Forest conservador", accuracy_rf),
    ("Validación temporal", accuracy_temporal),
]

for description, acc in results:
    # 0.4-0.8 is labelled good, above 0.8 is flagged for review, and
    # everything else is labelled low.
    if 0.4 <= acc <= 0.8:
        status = "✅ BUENO"
    elif acc > 0.8:
        status = "⚠️  REVISAR"
    else:
        status = "❌ BAJO"
    print(f"{description}: {acc:.3f} {status}")

# Closing recommendations, printed one per line in a fixed order.
print("\nRECOMENDACIONES FINALES:")
for recommendation in (
    "1. USA el modelo conservador sin country/quality_rating",
    "2. Accuracy objetivo: 0.50-0.70 para 6 proveedores",
    "3. Prefiere modelos simples y interpretables",
    "4. Valida siempre con datos temporales",
    "5. Si accuracy > 0.80, sospecha de overfitting",
):
    print(recommendation)

# ---------------------------------------------------------------------------
# Recommended final model
# ---------------------------------------------------------------------------
print("\nMODELO RECOMENDADO PARA PRODUCCIÓN:")
print("-" * 40)

# Same depth and sample limits as the conservative tree, but without a
# max_features cap.
final_params = {
    'max_depth': 3,
    'min_samples_split': 50,
    'min_samples_leaf': 20,
    'random_state': 42,
}
final_model = DecisionTreeClassifier(**final_params)
final_model.fit(X_train, y_train)

# Hold-out evaluation on the split created for the conservative tree.
y_pred_final = final_model.predict(X_test)
accuracy_final = accuracy_score(y_test, y_pred_final)

print(f"Accuracy modelo final: {accuracy_final:.3f}")
print(f"Profundidad: {final_model.get_depth()}")

# Persist the model together with everything needed to rebuild its inputs.
import joblib
import os
os.makedirs('../models', exist_ok=True)

artifacts = (
    (final_model, '../models/decision_tree_model_fixed.pkl'),
    (label_encoders, '../models/label_encoders_fixed.pkl'),
    (target_encoder, '../models/target_encoder_fixed.pkl'),
    (list(X_clean.columns), '../models/feature_names_fixed.pkl'),
)
for artifact, destination in artifacts:
    joblib.dump(artifact, destination)

print("\n✅ Modelo corregido guardado en ../models/*_fixed.pkl")
print("✅ Listo para usar en sistema de recomendación")

SOLUCIONES PARA OVERFITTING EXTREMO
SOLUCIÓN 1: ELIMINAR FEATURES PROBLEMÁTICAS
----------------------------------------
Features eliminadas: ['country', 'quality_rating']
Features utilizadas: ['price_usd', 'delivery_days', 'payment_terms_days', 'shipping_included', 'express_available', 'order_urgency', 'quantity_needed', 'budget_available', 'product_type', 'incoterms', 'month', 'quarter']

SOLUCIÓN 2: PARÁMETROS ULTRA-CONSERVADORES
----------------------------------------
Accuracy modelo conservador: 0.898
Profundidad: 3
Número de hojas: 7

SOLUCIÓN 3: VALIDACIÓN CRUZADA
----------------------------------------
CV Scores: [0.89711934 0.90082645 0.90082645 0.89669421 0.89669421]
CV Mean: 0.898 ± 0.002
✅ Variabilidad aceptable entre folds

SOLUCIÓN 4: AÑADIR RUIDO PARA REDUCIR OVERFITTING
----------------------------------------
Ruido añadido a features numéricas
Accuracy modelo con ruido: 0.887

SOLUCIÓN 5: ENSEMBLE DE MODELOS SIMPLES
----------------------------------------
Accuracy R