In [1]:

# ============================================================================
# 03_TRAIN_MODELS.IPYNB
# Entrenamiento y Evaluación de Modelos ML para Agricultura Vertical
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score, GridSearchCV
import xgboost as xgb
import lightgbm as lgb
import joblib
import json
import time
from datetime import datetime
import warnings
# Silence only deprecation-style noise. A blanket 'ignore' would also hide
# actionable warnings (for example scikit-learn's ConvergenceWarning raised by
# Lasso / ElasticNet / MLPRegressor when the optimiser stops before converging).
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Plot styling used by every figure in this notebook
plt.style.use('default')
sns.set_palette("husl")

print("="*60)
print("ENTRENAMIENTO DE MODELOS - AGRICULTURA VERTICAL")
print("="*60)


ENTRENAMIENTO DE MODELOS - AGRICULTURA VERTICAL


1. CARGA DE DATOS CON FEATURES ENGINEERED

In [2]:
# ============================================================================
# 1. LOAD FEATURE-ENGINEERED DATA
# ============================================================================

print("\n1. CARGANDO DATOS CON FEATURES ENGINEERED...")

# Single place to change where the processed artefacts live and what the
# target columns are called.
PROCESSED_DIR = '../data/processed'
TARGET_EFICIENCIA = 'eficiencia_fotosintetica_pct'
TARGET_FOTO = 'fotoluminiscencia_intensidad'

# Load the three splits produced by the feature-engineering step
train_data = pd.read_csv(f'{PROCESSED_DIR}/train_featured.csv')
val_data = pd.read_csv(f'{PROCESSED_DIR}/validation_featured.csv')
test_data = pd.read_csv(f'{PROCESSED_DIR}/test_featured.csv')

# Load metadata (explicit encoding so the result does not depend on the locale)
with open(f'{PROCESSED_DIR}/feature_engineering_metadata.json', 'r', encoding='utf-8') as f:
    feature_metadata = json.load(f)

selected_features = list(feature_metadata['selected_features'])

print(f"Train shape: {train_data.shape}")
print(f"Validation shape: {val_data.shape}")
print(f"Test shape: {test_data.shape}")
print(f"Features seleccionadas: {len(selected_features)}")

# Fail early, with a readable message, if a target is listed as a feature
# (direct target leakage) or if any split lacks a required column.
leaked_targets = [t for t in (TARGET_EFICIENCIA, TARGET_FOTO) if t in selected_features]
if leaked_targets:
    raise ValueError(f"Target columns listed as features: {leaked_targets}")

required_columns = selected_features + [TARGET_EFICIENCIA, TARGET_FOTO]
for split_name, split_df in (('train', train_data),
                             ('validation', val_data),
                             ('test', test_data)):
    missing_columns = [c for c in required_columns if c not in split_df.columns]
    if missing_columns:
        raise KeyError(f"Columns missing from the {split_name} split: {missing_columns}")

# Split features and targets
X_train = train_data[selected_features]
X_val = val_data[selected_features]
X_test = test_data[selected_features]

y_train_eficiencia = train_data[TARGET_EFICIENCIA]
y_val_eficiencia = val_data[TARGET_EFICIENCIA]
y_test_eficiencia = test_data[TARGET_EFICIENCIA]

# Photoluminescence models additionally receive the efficiency column as input.
# NOTE(review): this is the efficiency value stored in the dataset; if only a
# predicted efficiency is available at inference time, the scores obtained here
# may be optimistic - confirm with the data owners.
foto_features = selected_features + [TARGET_EFICIENCIA]
X_train_foto = train_data[foto_features]
X_val_foto = val_data[foto_features]
X_test_foto = test_data[foto_features]

y_train_foto = train_data[TARGET_FOTO]
y_val_foto = val_data[TARGET_FOTO]
y_test_foto = test_data[TARGET_FOTO]

print(f"\nDatos separados correctamente:")
print(f"X_train eficiencia: {X_train.shape}")
print(f"X_train fotoluminiscencia: {X_train_foto.shape}")



1. CARGANDO DATOS CON FEATURES ENGINEERED...
Train shape: (30000, 35)
Validation shape: (10000, 35)
Test shape: (10000, 35)
Features seleccionadas: 33

Datos separados correctamente:
X_train eficiencia: (30000, 33)
X_train fotoluminiscencia: (30000, 34)


2. CONFIGURACIÓN DE MODELOS

In [3]:
# ============================================================================
# 2. MODEL CONFIGURATION
# ============================================================================

print("\n2. CONFIGURANDO MODELOS...")

class ModelTrainer:
    """Tune, fit and evaluate a fixed catalogue of regressors, one target at a time.

    State accumulated across calls to ``train_models``:

    * ``models``        - ``{target_name: {model_name: fitted estimator}}``
    * ``scalers``       - ``{"<target_name>_<model_name>": fitted StandardScaler}``
    * ``results``       - ``{target_name: {model_name: result dict}}``
    * ``training_time`` - ``{target_name: {model_name: seconds}}``
    * ``errors``        - ``{target_name: {model_name: error message}}`` for
      models whose training raised an exception.
    """

    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.results = {}
        self.training_time = {}
        self.errors = {}

    def get_models_config(self):
        """Return the catalogue of models to train.

        Each entry maps a model name to a dict with:
        ``model`` (a fresh, unfitted estimator), ``scale`` (whether the inputs
        are standardised before fitting) and ``params`` (the grid handed to
        ``GridSearchCV``; an empty dict means "fit with defaults").
        """
        models_config = {
            # Linear models
            'LinearRegression': {
                'model': LinearRegression(),
                'scale': False,
                'params': {}
            },
            'Ridge': {
                'model': Ridge(),
                'scale': True,
                'params': {
                    'alpha': [0.1, 1.0, 10.0, 100.0]
                }
            },
            'Lasso': {
                'model': Lasso(),
                'scale': True,
                'params': {
                    'alpha': [0.01, 0.1, 1.0, 10.0]
                }
            },
            'ElasticNet': {
                'model': ElasticNet(),
                'scale': True,
                'params': {
                    'alpha': [0.01, 0.1, 1.0],
                    'l1_ratio': [0.1, 0.5, 0.9]
                }
            },

            # Tree-based models
            'RandomForest': {
                'model': RandomForestRegressor(random_state=42),
                'scale': False,
                'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [10, 20, None],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2]
                }
            },
            'GradientBoosting': {
                'model': GradientBoostingRegressor(random_state=42),
                'scale': False,
                'params': {
                    'n_estimators': [100, 200],
                    'learning_rate': [0.05, 0.1, 0.2],
                    'max_depth': [3, 5, 7]
                }
            },
            'XGBoost': {
                'model': xgb.XGBRegressor(random_state=42),
                'scale': False,
                'params': {
                    'n_estimators': [100, 200],
                    'learning_rate': [0.05, 0.1, 0.2],
                    'max_depth': [3, 5, 7],
                    'subsample': [0.8, 1.0]
                }
            },
            'LightGBM': {
                'model': lgb.LGBMRegressor(random_state=42, verbose=-1),
                'scale': False,
                'params': {
                    'n_estimators': [100, 200],
                    'learning_rate': [0.05, 0.1, 0.2],
                    'max_depth': [3, 5, 7],
                    'num_leaves': [31, 50]
                }
            },

            # Other models
            'SVR': {
                'model': SVR(),
                'scale': True,
                'params': {
                    'C': [0.1, 1, 10, 100],
                    'gamma': ['scale', 'auto'],
                    'kernel': ['rbf', 'linear']
                }
            },
            'MLPRegressor': {
                'model': MLPRegressor(random_state=42, max_iter=500),
                'scale': True,
                'params': {
                    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
                    'alpha': [0.001, 0.01, 0.1],
                    'learning_rate_init': [0.001, 0.01]
                }
            }
        }

        return models_config

    def prepare_data(self, X_train, X_val, X_test, scale=False, scaler_name='scaler'):
        """Return the three feature matrices as arrays, standardised if requested.

        Args:
            X_train, X_val, X_test: feature matrices (DataFrame or array-like).
            scale: when True, a ``StandardScaler`` is fitted on ``X_train`` only
                and applied to all three splits.
            scaler_name: key under which the fitted scaler is stored in
                ``self.scalers``.

        Returns:
            Tuple ``(X_train, X_val, X_test)`` of arrays.
        """
        if scale:
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_val_scaled = scaler.transform(X_val)
            X_test_scaled = scaler.transform(X_test)

            self.scalers[scaler_name] = scaler

            return X_train_scaled, X_val_scaled, X_test_scaled

        # np.asarray accepts DataFrames and plain arrays alike, whereas the
        # previous `.values` access failed on ndarray input.
        return np.asarray(X_train), np.asarray(X_val), np.asarray(X_test)

    def evaluate_model(self, y_true, y_pred, model_name, dataset_name):
        """Compute MAE, MSE, RMSE, R2 and MAPE for one set of predictions.

        ``model_name`` and ``dataset_name`` are accepted for call-site
        readability; they do not influence the computation.

        Returns:
            Dict with keys ``'MAE'``, ``'MSE'``, ``'RMSE'``, ``'R2'``, ``'MAPE'``.
        """
        mse = mean_squared_error(y_true, y_pred)
        metrics = {
            'MAE': mean_absolute_error(y_true, y_pred),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': r2_score(y_true, y_pred),
            'MAPE': mean_absolute_percentage_error(y_true, y_pred)
        }

        return metrics

    def hyperparameter_tuning(self, model, params, X_train, y_train, cv=3):
        """Fit ``model``, grid-searching ``params`` when a grid is given.

        Args:
            model: unfitted estimator.
            params: parameter grid; an empty dict skips the search and fits the
                estimator as configured.
            X_train, y_train: training data.
            cv: number of cross-validation folds used by the grid search.

        Returns:
            Tuple ``(fitted_estimator, best_params)``; ``best_params`` is ``{}``
            when no search was run.
        """
        if len(params) > 0:
            grid_search = GridSearchCV(
                model,
                params,
                cv=cv,
                scoring='neg_mean_absolute_error',
                n_jobs=-1
            )
            grid_search.fit(X_train, y_train)
            return grid_search.best_estimator_, grid_search.best_params_

        model.fit(X_train, y_train)
        return model, {}

    def train_models(self, X_train, X_val, X_test, y_train, y_val, y_test,
                    target_name, quick_mode=False):
        """Train every configured model for one target and collect the results.

        Args:
            X_train, X_val, X_test: feature matrices for the three splits.
            y_train, y_val, y_test: matching target vectors.
            target_name: key under which results are stored on the trainer.
            quick_mode: when True, each hyperparameter list is truncated to its
                first two values to shrink the grid.

        Returns:
            ``{model_name: result}`` where each result holds the fitted model,
            best parameters, train/val/test metrics, training time in seconds
            and the predictions for each split. Models that raised an exception
            are absent from the result; their messages are kept in
            ``self.errors[target_name]``.
        """

        models_config = self.get_models_config()
        results = {}
        errors = {}

        print(f"\nEntrenando modelos para: {target_name}")
        print("-" * 40)

        for model_name, config in models_config.items():
            print(f"\nEntrenando {model_name}...")
            start_time = time.time()

            try:
                # Prepare data.
                # NOTE(review): the scaler is fitted on the full training split
                # before GridSearchCV creates its folds, so the inner validation
                # folds share scaling statistics with their training folds.
                scaler_name = f"{target_name}_{model_name}"
                X_train_prep, X_val_prep, X_test_prep = self.prepare_data(
                    X_train, X_val, X_test,
                    scale=config['scale'],
                    scaler_name=scaler_name
                )

                # Hyperparameter grid (first two values per parameter in quick mode)
                if quick_mode:
                    params_to_use = {
                        key: values[:2] for key, values in config['params'].items()
                    }
                else:
                    params_to_use = config['params']

                best_model, best_params = self.hyperparameter_tuning(
                    config['model'], params_to_use, X_train_prep, y_train
                )

                # Stop the clock right after fitting so the figure reported as
                # training time excludes prediction and metric computation.
                training_time = time.time() - start_time

                # Predictions
                y_train_pred = best_model.predict(X_train_prep)
                y_val_pred = best_model.predict(X_val_prep)
                y_test_pred = best_model.predict(X_test_prep)

                # Evaluation
                train_metrics = self.evaluate_model(y_train, y_train_pred, model_name, 'train')
                val_metrics = self.evaluate_model(y_val, y_val_pred, model_name, 'val')
                test_metrics = self.evaluate_model(y_test, y_test_pred, model_name, 'test')

                # Store results
                results[model_name] = {
                    'model': best_model,
                    'best_params': best_params,
                    'train_metrics': train_metrics,
                    'val_metrics': val_metrics,
                    'test_metrics': test_metrics,
                    'training_time': training_time,
                    'predictions': {
                        'train': y_train_pred,
                        'val': y_val_pred,
                        'test': y_test_pred
                    }
                }

                print(f"  - Train MAE: {train_metrics['MAE']:.3f}, R2: {train_metrics['R2']:.3f}")
                print(f"  - Val MAE: {val_metrics['MAE']:.3f}, R2: {val_metrics['R2']:.3f}")
                print(f"  - Test MAE: {test_metrics['MAE']:.3f}, R2: {test_metrics['R2']:.3f}")
                print(f"  - Tiempo: {training_time:.1f}s")
                print(f"  - Mejores params: {best_params}")

            except Exception as e:
                # Best effort: one failing model must not abort the whole run,
                # but the failure is recorded so it can be inspected afterwards.
                errors[model_name] = str(e)
                print(f"  - ERROR: {str(e)}")
                continue

        self.results[target_name] = results
        self.errors[target_name] = errors
        # Populate the lookup tables declared in __init__.
        self.models[target_name] = {
            name: result['model'] for name, result in results.items()
        }
        self.training_time[target_name] = {
            name: result['training_time'] for name, result in results.items()
        }
        return results

# Create the trainer instance; later cells call it once per target and read
# the accumulated outcome from trainer.results
trainer = ModelTrainer()



2. CONFIGURANDO MODELOS...


3. ENTRENAMIENTO PARA EFICIENCIA FOTOSINTÉTICA

In [None]:
# ============================================================================
# 3. TRAINING FOR PHOTOSYNTHETIC EFFICIENCY
# ============================================================================

print("\n3. ENTRENAMIENTO PARA EFICIENCIA FOTOSINTETICA...")

# Runs the full hyperparameter grids; set quick_mode=True for a fast trial run.
# NOTE(review): the recorded output of this cell shows R2 = 1.000 even for plain
# LinearRegression - worth checking whether an engineered feature encodes the
# target before trusting these scores.
eficiencia_results = trainer.train_models(
    X_train=X_train,
    X_val=X_val,
    X_test=X_test,
    y_train=y_train_eficiencia,
    y_val=y_val_eficiencia,
    y_test=y_test_eficiencia,
    target_name='eficiencia',
    quick_mode=False,
)


3. ENTRENAMIENTO PARA EFICIENCIA FOTOSINTETICA...

Entrenando modelos para: eficiencia
----------------------------------------

Entrenando LinearRegression...
  - Train MAE: 0.000, R2: 1.000
  - Val MAE: 0.000, R2: 1.000
  - Test MAE: 0.000, R2: 1.000
  - Tiempo: 0.1s
  - Mejores params: {}

Entrenando Ridge...
  - Train MAE: 0.000, R2: 1.000
  - Val MAE: 0.000, R2: 1.000
  - Test MAE: 0.000, R2: 1.000
  - Tiempo: 3.6s
  - Mejores params: {'alpha': 0.1}

Entrenando Lasso...
  - Train MAE: 0.008, R2: 1.000
  - Val MAE: 0.008, R2: 1.000
  - Test MAE: 0.008, R2: 1.000
  - Tiempo: 1.1s
  - Mejores params: {'alpha': 0.01}

Entrenando ElasticNet...
  - Train MAE: 0.018, R2: 1.000
  - Val MAE: 0.018, R2: 1.000
  - Test MAE: 0.018, R2: 1.000
  - Tiempo: 1.4s
  - Mejores params: {'alpha': 0.01, 'l1_ratio': 0.9}

Entrenando RandomForest...
  - Train MAE: 0.001, R2: 1.000
  - Val MAE: 0.001, R2: 1.000
  - Test MAE: 0.001, R2: 1.000
  - Tiempo: 1864.3s
  - Mejores params: {'max_depth': 20, 'min_

4. ENTRENAMIENTO PARA FOTOLUMINISCENCIA

In [None]:
print("\n4. ENTRENAMIENTO PARA FOTOLUMINISCENCIA...")

# Same training routine as for efficiency, fed with the photoluminescence
# feature matrices and target vectors.
fotolum_results = trainer.train_models(
    X_train=X_train_foto,
    X_val=X_val_foto,
    X_test=X_test_foto,
    y_train=y_train_foto,
    y_val=y_val_foto,
    y_test=y_test_foto,
    target_name='fotoluminiscencia',
    quick_mode=False,
)

5. COMPARACIÓN DE MODELOS

In [None]:
# ============================================================================
# 5. MODEL COMPARISON
# ============================================================================

print("\n5. COMPARACION DE MODELOS...")

def create_comparison_table(results_dict):
    """Flatten nested training results into one row per (target, model).

    Args:
        results_dict: ``{target_name: {model_name: result}}`` where each result
            provides ``'train_metrics'``, ``'val_metrics'`` and
            ``'test_metrics'`` (each with ``'MAE'`` and ``'R2'``) plus
            ``'training_time'``.

    Returns:
        DataFrame with columns Target, Model, Train_MAE, Val_MAE, Test_MAE,
        Train_R2, Val_R2, Test_R2 and Training_Time. The columns are present
        even when ``results_dict`` is empty, so filtering on ``'Target'`` never
        raises ``KeyError``.
    """
    columns = [
        'Target', 'Model',
        'Train_MAE', 'Val_MAE', 'Test_MAE',
        'Train_R2', 'Val_R2', 'Test_R2',
        'Training_Time',
    ]

    comparison_data = [
        {
            'Target': target_name,
            'Model': model_name,
            'Train_MAE': model_results['train_metrics']['MAE'],
            'Val_MAE': model_results['val_metrics']['MAE'],
            'Test_MAE': model_results['test_metrics']['MAE'],
            'Train_R2': model_results['train_metrics']['R2'],
            'Val_R2': model_results['val_metrics']['R2'],
            'Test_R2': model_results['test_metrics']['R2'],
            'Training_Time': model_results['training_time'],
        }
        for target_name, target_results in results_dict.items()
        for model_name, model_results in target_results.items()
    ]

    # Passing the column list explicitly keeps the schema stable for empty input.
    return pd.DataFrame(comparison_data, columns=columns)

# Build the comparison table from everything the trainer has stored
all_results = trainer.results
comparison_df = create_comparison_table(all_results)

# Show the best models per target
print("\nRESULTADOS COMPARATIVOS:")
print("="*80)

for target in ['eficiencia', 'fotoluminiscencia']:
    print(f"\n{target.upper()}:")

    # A target may be missing (cell not run) or empty (every model failed);
    # skip it instead of crashing on an empty selection.
    if not all_results.get(target):
        print("  Sin resultados para este target.")
        continue

    # Rank on the validation split: choosing the winner by its test error would
    # make the reported test metrics an optimistic estimate.
    target_df = comparison_df[comparison_df['Target'] == target].sort_values('Val_MAE')

    print(target_df[['Model', 'Val_MAE', 'Test_MAE', 'Test_R2', 'Training_Time']].head(5).to_string(index=False))

    # Best model (lowest validation MAE); test metrics are reported, not used to choose
    best_model = target_df.iloc[0]
    print(f"\nMEJOR MODELO: {best_model['Model']}")
    print(f"  - Val MAE: {best_model['Val_MAE']:.3f}")
    print(f"  - Test MAE: {best_model['Test_MAE']:.3f}")
    print(f"  - Test R2: {best_model['Test_R2']:.3f}")
    print(f"  - Tiempo entrenamiento: {best_model['Training_Time']:.1f}s")

6. ANÁLISIS DE RESIDUOS

In [None]:
# ============================================================================
# 6. RESIDUAL ANALYSIS
# ============================================================================

print("\n6. ANALISIS DE RESIDUOS...")

def plot_residuals_analysis(y_true, y_pred, model_name, target_name):
    """Draw a 2x2 residual diagnostic figure and print summary statistics.

    Panels: residuals vs predictions, normal Q-Q plot, residual histogram and
    predictions vs true values (with the identity line).

    Args:
        y_true: observed target values (Series or array-like).
        y_pred: predicted values, same number of elements as ``y_true``.
        model_name: model label used in the title and in the printed summary.
        target_name: target label used in the figure title.

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` differ in length.
    """
    from scipy import stats

    # Work on flat float arrays: subtracting two pandas objects aligns them by
    # index label rather than by position, and an (n, 1) prediction array
    # would broadcast against an (n,) target into an n x n matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    residuals = y_true - y_pred

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle(f'Análisis de Residuos - {model_name} ({target_name})')

    # 1. Residuals vs predictions
    axes[0,0].scatter(y_pred, residuals, alpha=0.6)
    axes[0,0].axhline(y=0, color='red', linestyle='--')
    axes[0,0].set_xlabel('Predicciones')
    axes[0,0].set_ylabel('Residuos')
    axes[0,0].set_title('Residuos vs Predicciones')

    # 2. Q-Q plot against the normal distribution
    stats.probplot(residuals, dist="norm", plot=axes[0,1])
    axes[0,1].set_title('Q-Q Plot')

    # 3. Histogram of residuals
    axes[1,0].hist(residuals, bins=30, alpha=0.7, edgecolor='black')
    axes[1,0].set_xlabel('Residuos')
    axes[1,0].set_ylabel('Frecuencia')
    axes[1,0].set_title('Distribución de Residuos')

    # 4. Predictions vs true values, with the identity line as reference
    axes[1,1].scatter(y_true, y_pred, alpha=0.6)
    min_val = min(y_true.min(), y_pred.min())
    max_val = max(y_true.max(), y_pred.max())
    axes[1,1].plot([min_val, max_val], [min_val, max_val], 'red', linestyle='--')
    axes[1,1].set_xlabel('Valores Reales')
    axes[1,1].set_ylabel('Predicciones')
    axes[1,1].set_title('Predicciones vs Valores Reales')

    plt.tight_layout()
    plt.show()

    # Residual statistics. ddof=1 keeps the sample standard deviation that
    # pandas reported when the residuals were a Series.
    print(f"\nEstadísticas de residuos para {model_name}:")
    print(f"  - Media: {residuals.mean():.3f}")
    print(f"  - Std: {residuals.std(ddof=1):.3f}")
    print(f"  - Skewness: {stats.skew(residuals):.3f}")
    print(f"  - Kurtosis: {stats.kurtosis(residuals):.3f}")

# Residual analysis for the best model of each target
for target in ['eficiencia', 'fotoluminiscencia']:
    # Skip targets that were not trained or for which every model failed,
    # instead of crashing on an empty selection.
    if not all_results.get(target):
        print(f"\nSin resultados para {target}; se omite el analisis de residuos.")
        continue

    # Choose the model on the validation split so that the test residuals
    # plotted below are not biased by the selection itself.
    target_df = comparison_df[comparison_df['Target'] == target].copy()
    best_model_name = target_df.sort_values('Val_MAE').iloc[0]['Model']

    # Stored results (model, metrics, predictions) of the chosen model
    best_results = all_results[target][best_model_name]

    if target == 'eficiencia':
        y_true = y_test_eficiencia
    else:
        y_true = y_test_foto

    y_pred = best_results['predictions']['test']

    plot_residuals_analysis(y_true, y_pred, best_model_name, target)

7. ANÁLISIS DE IMPORTANCIA DE FEATURES

In [None]:
# ============================================================================
# 7. FEATURE IMPORTANCE ANALYSIS
# ============================================================================

print("\n7. ANALISIS DE IMPORTANCIA DE FEATURES...")

def plot_feature_importance(results, target_name, top_k=15, feature_names=None):
    """Plot the top-k feature importances of the tree-based models in a 2x2 grid.

    Args:
        results: ``{model_name: result}`` for one target; each result must hold
            the fitted estimator under ``'model'``.
        target_name: target label used in the panel titles and, when
            ``feature_names`` is not given, to choose the default feature list.
        top_k: number of most important features shown per model.
        feature_names: optional list of column names matching the order of the
            training matrix. Defaults to the notebook-level ``selected_features``
            (plus ``'eficiencia_fotosintetica_pct'`` for any target other than
            ``'eficiencia'``).

    Raises:
        ValueError: if the number of feature names differs from the number of
            importances exposed by a model.
    """

    tree_models = ['RandomForest', 'GradientBoosting', 'XGBoost', 'LightGBM']

    if feature_names is None:
        if target_name == 'eficiencia':
            feature_names = selected_features
        else:
            feature_names = selected_features + ['eficiencia_fotosintetica_pct']
    feature_names = list(feature_names)

    fig, axes = plt.subplots(2, 2, figsize=(20, 15))
    axes = axes.flatten()

    for ax, model_name in zip(axes, tree_models):
        model = results[model_name]['model'] if model_name in results else None
        importances = getattr(model, 'feature_importances_', None)

        if importances is None:
            # Nothing to draw for this model: hide the panel rather than
            # leaving an empty set of axes in the figure.
            ax.set_visible(False)
            continue

        if len(importances) != len(feature_names):
            raise ValueError(
                f"{model_name}: got {len(feature_names)} feature names for "
                f"{len(importances)} importances"
            )

        # Keep the k most important features, largest first
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False).head(top_k)

        # Horizontal bars, most important feature at the top
        positions = range(len(importance_df))
        ax.barh(positions, importance_df['importance'])
        ax.set_yticks(positions)
        ax.set_yticklabels(importance_df['feature'])
        ax.set_xlabel('Importancia')
        ax.set_title(f'{model_name} - {target_name}')
        ax.invert_yaxis()

    plt.tight_layout()
    plt.show()

# Plot importances for every target that has stored results
for target in ('eficiencia', 'fotoluminiscencia'):
    if target not in all_results:
        continue
    plot_feature_importance(all_results[target], target)