# Modelo de predicción

In [16]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer # Added this import
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

1. Leer los datos procesados

In [17]:
df = pd.read_csv('../data/consumption_features.csv')
df.head()

Unnamed: 0,Passenger_Count,Unit_Cost,haul,Origin_DOH,Origin_JFK,Origin_LHR,Origin_MEX,Origin_NRT,Origin_ZRH,Product_BRD001,...,Product_SNK001,Percentage_Returned,day,flights,passengers,max capacity,Avg_Pass_Per_Flight_Day,Load_vs_Daily_Avg,Month,DayOfWeek
0,272,0.35,2,True,False,False,False,False,False,True,...,False,0.327684,0.0,0.0,0.0,0.0,0.0,0.0,,
1,272,0.8,2,True,False,False,False,False,False,False,...,False,0.326531,0.0,0.0,0.0,0.0,0.0,0.0,,
2,272,0.75,2,True,False,False,False,False,False,False,...,False,0.274809,0.0,0.0,0.0,0.0,0.0,0.0,,
3,272,0.45,2,True,False,False,False,False,False,False,...,False,0.180488,0.0,0.0,0.0,0.0,0.0,0.0,,
4,272,0.5,2,True,False,False,False,False,False,False,...,False,0.482234,0.0,0.0,0.0,0.0,0.0,0.0,9.0,4.0


## Preparación del dataset

1. Preparación de los datos

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- 2. Definición de X e Y ---
TARGET_COLUMN = 'Percentage_Returned'
Y = df[TARGET_COLUMN]
X = df.drop(TARGET_COLUMN, axis=1)

# Convertir booleanos a enteros (0/1) para compatibilidad de modelos
bool_cols = X.select_dtypes(include='bool').columns
X[bool_cols] = X[bool_cols].astype(int)

# --- 2. Dividir Datos (mismo split que en A3) ---
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

# --- 3. Escalar Datos (para modelos que lo requieren) ---
scaler = StandardScaler()
X_train_scaled_raw = scaler.fit_transform(X_train)
X_test_scaled_raw = scaler.transform(X_test)

imputer = SimpleImputer(strategy='constant', fill_value=0)
X_train_scaled = imputer.fit_transform(X_train_scaled_raw)
X_test_scaled = imputer.transform(X_test_scaled_raw)

# Convertir de nuevo a DataFrame para mantener nombres de columnas (opcional pero útil)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)



## Modelos y Parámetros

In [19]:
# --- 4. Definir Modelos y Grillas de Hiperparámetros ---
# (Inspirado en el notebook del Titanic, pero para Regresión)

models_to_tune = [
    {
        'name': 'Ridge',
        'model': Ridge(),
        'params': {'alpha': [0.1, 1, 10, 100, 1000]},
        'search_type': 'grid',
        'scaled': True # Ridge se beneficia del escalado
    },
    {
        'name': 'RandomForestRegressor',
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': randint(100, 500),
            'max_depth': randint(5, 30),
            'min_samples_leaf': randint(1, 10)
        },
        'search_type': 'random',
        'scaled': False # Los árboles no lo necesitan
    },
    {
        'name': 'SVR',
        'model': SVR(),
        'params': {
            'C': uniform(0.1, 100),
            'gamma': ['scale', 'auto', 0.01, 0.1],
            'kernel': ['rbf', 'poly']
        },
        'search_type': 'random',
        'scaled': True # SVR lo necesita
    },
    {
        'name': 'XGBRegressor',
        'model': XGBRegressor(random_state=42, objective='reg:squarederror'),
        'params': {
            'n_estimators': randint(100, 500),
            'learning_rate': uniform(0.01, 0.2),
            'max_depth': randint(3, 15),
            'subsample': uniform(0.7, 0.3) # (0.7 a 1.0)
        },
        'search_type': 'random',
        'scaled': False # Los árboles no lo necesitan
    }
]

## Entrenamiento del modelo

In [20]:
best_models = {}
test_results = {}
n_iter_random = 20 # Número de iteraciones para RandomizedSearch (ajústalo si quieres una búsqueda más larga)

# --- 5. Bucle de Entrenamiento y Optimización ---
print("\n--- Iniciando Optimización de Modelos ---")

for config in models_to_tune:
    name = config['name']
    print(f"\nOptimizando {name}...")
    
    # Elegir datos escalados o no escalados
    X_train_data = X_train_scaled if config['scaled'] else X_train
    
    if config['search_type'] == 'grid':
        search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=5,
            scoring='neg_mean_absolute_error',
            n_jobs=-1,
            verbose=0
        )
    else: # 'random'
        search = RandomizedSearchCV(
            estimator=config['model'],
            param_distributions=config['params'],
            n_iter=n_iter_random,
            cv=5,
            scoring='neg_mean_absolute_error',
            n_jobs=-1,
            random_state=42,
            verbose=0
        )
    
    search.fit(X_train_data, Y_train)
    best_models[name] = search.best_estimator_
    print(f"Mejores parámetros para {name}: {search.best_params_}")
    print(f"Mejor MAE (CV): {-search.best_score_:.4f}")


--- Iniciando Optimización de Modelos ---

Optimizando Ridge...
Mejores parámetros para Ridge: {'alpha': 1000}
Mejor MAE (CV): 0.0757

Optimizando RandomForestRegressor...
Mejores parámetros para RandomForestRegressor: {'max_depth': 6, 'min_samples_leaf': 8, 'n_estimators': 393}
Mejor MAE (CV): 0.0775

Optimizando SVR...
Mejores parámetros para SVR: {'C': np.float64(20.067378215835976), 'gamma': 0.01, 'kernel': 'poly'}
Mejor MAE (CV): 0.0770

Optimizando XGBRegressor...
Mejores parámetros para XGBRegressor: {'learning_rate': np.float64(0.014116898859160489), 'max_depth': 4, 'n_estimators': 443, 'subsample': np.float64(0.9497327922401264)}
Mejor MAE (CV): 0.0814


In [21]:
# --- 6. Evaluación Final en el Test Set ---
print("\n--- Evaluación Final en el Test Set ---")

for name, model in best_models.items():
    
    # Elegir datos de test escalados o no escalados
    X_test_data = X_test_scaled if models_to_tune[next(i for i, d in enumerate(models_to_tune) if d['name'] == name)]['scaled'] else X_test
    
    Y_pred = model.predict(X_test_data)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    
    test_results[name] = {'mae': mae, 'r2': r2}
    print(f"\n{name}:")
    print(f"  MAE (Test): {mae:.4f}")
    print(f"  R² (Test): {r2:.4f}")


--- Evaluación Final en el Test Set ---

Ridge:
  MAE (Test): 0.0837
  R² (Test): 0.0139

RandomForestRegressor:
  MAE (Test): 0.0864
  R² (Test): -0.0540

SVR:
  MAE (Test): 0.0828
  R² (Test): -0.0213

XGBRegressor:
  MAE (Test): 0.0930
  R² (Test): -0.2925


In [22]:
# --- 7. Encontrar y Guardar el Mejor Modelo ---
best_model_name = min(test_results, key=lambda k: test_results[k]['mae'])
best_mae = test_results[best_model_name]['mae']

print("\n--- --- --- --- ---")
print(f"El mejor modelo es: {best_model_name} con un MAE de {best_mae:.4f}")
if best_mae < 0.02:
    print("¡Felicidades! Se alcanzó el objetivo de MAE < 0.02.")
else:
    print(f"El MAE está a {best_mae - 0.02:.4f} puntos del objetivo (< 0.02).")

# Crear directorio para guardar modelos
output_dir = 'models'
os.makedirs(output_dir, exist_ok=True)
print(f"\nGuardando modelos en el directorio '{output_dir}'...")

saved_models = []
for name, model in best_models.items():
    # Guardamos todos los modelos optimizados, no solo el mejor
    filename = f"{output_dir}/{name.lower().replace(' ', '_')}_best.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    saved_models.append(filename)

print("Modelos guardados:")
for f in saved_models:
    print(f"- {f}")


--- --- --- --- ---
El mejor modelo es: SVR con un MAE de 0.0828
El MAE está a 0.0628 puntos del objetivo (< 0.02).

Guardando modelos en el directorio 'models'...
Modelos guardados:
- models/ridge_best.pkl
- models/randomforestregressor_best.pkl
- models/svr_best.pkl
- models/xgbregressor_best.pkl


# Recommendations for Improving the Model

This section includes changes to improve the model's performance and reduce error.

In [23]:
# --- Improved Data Handling ---
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

# Replace filling NaN with 0 by using mean imputation
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)


# Update parameter grid to replace 'auto' with 'sqrt'
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']  # Removed 'auto'
}

# Reinitialize GridSearchCV with error_score='raise' to debug invalid combinations
rf_grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=2,
    error_score='raise'  # Raise errors for invalid parameter combinations
)

# Fit GridSearchCV
print("Starting hyperparameter tuning with updated parameter grid...")
rf_grid_search.fit(X_train, Y_train)
print("Best parameters found:", rf_grid_search.best_params_)

Starting hyperparameter tuning with updated parameter grid...
Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=300; total time=   1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=300; total time=   1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_spli

In [24]:
# Use the best model
best_rf_model = rf_grid_search.best_estimator_

# Additional Metrics
from sklearn.metrics import mean_squared_error

# Predict using the best model
Y_pred_best_rf = best_rf_model.predict(X_test)

# Calculate additional metrics
# Some sklearn versions don't accept the `squared` kwarg, so compute RMSE manually
rf_rmse = np.sqrt(mean_squared_error(Y_test, Y_pred_best_rf))
print(f"Root Mean Squared Error (RMSE): {rf_rmse:.4f}")

# Feature Importance
importances = best_rf_model.feature_importances_
feature_names = X.columns

# Sort features by importance
sorted_indices = np.argsort(importances)[::-1]

print("\nFeature Importances:")
for idx in sorted_indices:
    print(f"{feature_names[idx]}: {importances[idx]:.4f}")

Root Mean Squared Error (RMSE): 0.0991

Feature Importances:
Passenger_Count: 0.2460
DayOfWeek: 0.1312
Unit_Cost: 0.1199
Month: 0.0497
Origin_JFK: 0.0416
Product_DRK023: 0.0385
Product_DRK024: 0.0369
haul: 0.0334
Origin_NRT: 0.0291
Origin_MEX: 0.0289
Product_NUT030: 0.0281
Product_COF200: 0.0272
Product_BRD001: 0.0267
Origin_DOH: 0.0261
Origin_LHR: 0.0216
Origin_ZRH: 0.0214
Product_CRK075: 0.0202
Product_SNK001: 0.0199
Product_HTB110: 0.0190
Product_JCE200: 0.0176
Product_CHO050: 0.0169
day: 0.0000
max capacity: 0.0000
Avg_Pass_Per_Flight_Day: 0.0000
Load_vs_Daily_Avg: 0.0000
passengers: 0.0000
flights: 0.0000


In [25]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(Y_test, Y_pred_best_rf)
print(f"Mean Squared Error (MSE): {mse:.4f}")

mae = mean_absolute_error(Y_test, Y_pred_best_rf)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

r2 = r2_score(Y_test, Y_pred_best_rf)
print(f"R² (Coefficient of Determination): {r2:.4f}")

#antes
#Error Absoluto Medio (MAE) en Test: 0.0924
#R-cuadrado (R²) en Test: -0.3205

Mean Squared Error (MSE): 0.0098
Mean Absolute Error (MAE): 0.0871
R² (Coefficient of Determination): -0.0936
