In [None]:
# ============================================================================
# 02_FEATURE_ENGINEERING.IPYNB
# Ingeniería de Características para Agricultura Vertical
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Plot styling used by every figure in this notebook
plt.style.use('default')
sns.set_palette("husl")

# Notebook banner: rule, title, rule (one item per output line)
print("=" * 60, "INGENIERIA DE CARACTERISTICAS - AGRICULTURA VERTICAL", "=" * 60, sep="\n")

1. CARGA DE DATOS PROCESADOS

In [None]:
# ============================================================================
# 1. CARGA DE DATOS PROCESADOS
# ============================================================================

print("\n1. CARGANDO DATOS PROCESADOS...")

# Load the processed dataset and its train / validation / test splits
df = pd.read_csv('../data/processed/dataset_processed.csv')
train_data = pd.read_csv('../data/processed/train_data.csv')
val_data = pd.read_csv('../data/processed/validation_data.csv')
test_data = pd.read_csv('../data/processed/test_data.csv')

# Load the preprocessing metadata.
# The encoding is given explicitly: without it, open() falls back to the
# platform locale encoding, so the same file could decode differently (or fail)
# on another machine. JSON text is UTF-8 (and plain ASCII is a subset of it).
import json
with open('../data/processed/preprocessing_metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

print(f"Dataset completo: {df.shape}")
print(f"Entrenamiento: {train_data.shape}")
print(f"Validacion: {val_data.shape}")
print(f"Test: {test_data.shape}")

# Base feature list recorded in the metadata under 'feature_columns_eficiencia'
base_features = metadata['feature_columns_eficiencia']
print(f"\nFeatures base: {len(base_features)}")
print(base_features)

2. ANÁLISIS DE CORRELACIONES AVANZADO

In [None]:
# ============================================================================
# 2. ANÁLISIS DE CORRELACIONES AVANZADO
# ============================================================================

print("\n2. ANALISIS DE CORRELACIONES AVANZADO...")

def advanced_correlation_analysis(df):
    """Plot Pearson/Spearman heatmaps and rank predictors against each target.

    Only numeric columns take part in the analysis. Two lower-triangle heatmaps
    (Pearson and Spearman) are drawn side by side and shown. Then, for every
    target column that exists in ``df``, the remaining numeric columns are
    ranked by absolute correlation with that target.

    The target itself is excluded from its own ranking: its self-correlation is
    always 1.0, so it would otherwise occupy one of the ten slots without
    carrying any information.

    Args:
        df: ``pandas.DataFrame`` to analyse.

    Returns:
        Tuple ``(corr_pearson, corr_spearman, correlations_summary)`` where the
        first two items are the full correlation matrices of the numeric
        columns and ``correlations_summary`` maps each target found in ``df``
        to ``{'pearson': {...}, 'spearman': {...}}``; each inner dict holds up
        to ten ``column -> |correlation|`` pairs, strongest first.
    """
    # Restrict the analysis to numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_df = df[numeric_cols]

    corr_pearson = numeric_df.corr()                     # linear association
    corr_spearman = numeric_df.corr(method='spearman')   # monotonic association

    # One heatmap per method; the upper triangle (diagonal included) is masked
    # because the matrices are symmetric.
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    panels = (
        (corr_pearson, 'Correlación Pearson (Lineal)'),
        (corr_spearman, 'Correlación Spearman (Monotónica)'),
    )
    for ax, (corr_matrix, title) in zip(axes, panels):
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm',
                    center=0, square=True, ax=ax)
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

    # Rank predictors by strength of correlation with each available target
    target_cols = ['eficiencia_fotosintetica_pct', 'fotoluminiscencia_intensidad']

    correlations_summary = {}
    for target in target_cols:
        if target not in df.columns:
            continue

        # Leave the target out of its own ranking (self-correlation is 1.0)
        predictors = numeric_df.drop(columns=[target], errors='ignore')

        pearson_corr = predictors.corrwith(df[target]).abs().sort_values(ascending=False)
        spearman_corr = (
            predictors.corrwith(df[target], method='spearman')
            .abs()
            .sort_values(ascending=False)
        )

        correlations_summary[target] = {
            'pearson': pearson_corr.head(10).to_dict(),
            'spearman': spearman_corr.head(10).to_dict()
        }

    return corr_pearson, corr_spearman, correlations_summary

corr_pearson, corr_spearman, correlations_summary = advanced_correlation_analysis(df)

# Report the strongest Pearson correlations for each target
print("\nCORRELACIONES MAS FUERTES CON TARGETS:")
for target, corrs in correlations_summary.items():
    print(f"\n{target.upper()}:")
    print("  Pearson (Top 5):")
    # Remove the target's own entry BEFORE slicing. Slicing first and skipping
    # the target afterwards would leave only four rows under a "Top 5" heading
    # whenever the target is part of the ranking.
    top_pearson = [(var, corr) for var, corr in corrs['pearson'].items() if var != target][:5]
    for var, corr in top_pearson:
        print(f"    {var}: {corr:.3f}")


3. CREACIÓN DE FEATURES AVANZADAS

In [None]:
# ============================================================================
# 3. CREACIÓN DE FEATURES AVANZADAS
# ============================================================================

print("\n3. CREANDO FEATURES AVANZADAS...")

def create_advanced_features(df):
    """Create domain-driven engineered features on a copy of ``df``.

    Every feature is optional: it is only created when all of the source
    columns it needs are present. The input frame is never modified.

    Feature groups, in creation order:
        1. multiplicative interactions between pairs of variables,
        2. ratios between variables,
        3. absolute distance of a variable from its reference set-point,
        4. composite indices,
        5. cyclical (sin/cos) encodings and light-period features from
           ``hora_dia``, ``dia_semana`` and ``mes``,
        6. log1p, squared and square-root transforms.

    Args:
        df: Input ``pandas.DataFrame``.

    Returns:
        A new ``DataFrame`` with the original columns followed by the
        engineered ones. In ``par_temp_ratio``, rows whose ``temperatura_c`` is
        exactly 0 hold NaN (a zero denominator would otherwise yield ``inf``).
    """
    df_featured = df.copy()

    def _has(*cols):
        """Return True when every named column exists in the working frame."""
        return all(col in df_featured.columns for col in cols)

    print("  Creando features de interacciones...")

    # 3.1 MULTIPLICATIVE INTERACTIONS
    # new column -> (left factor, right factor)
    interactions = {
        'temp_co2_interaction': ('temperatura_c', 'co2_ppm'),
        'par_co2_interaction': ('par_umol_m2_s', 'co2_ppm'),
        'temp_par_interaction': ('temperatura_c', 'par_umol_m2_s'),
        'humedad_vpd_interaction': ('humedad_rel_pct', 'vpd_kpa'),
    }
    for new_col, (left, right) in interactions.items():
        if _has(left, right):
            df_featured[new_col] = df_featured[left] * df_featured[right]

    print("  Creando features de ratios...")

    # 3.2 RATIOS
    if _has('co2_ppm', 'temperatura_c'):
        # Denominator is absolute temperature (Celsius + 273.15)
        df_featured['co2_temp_ratio'] = df_featured['co2_ppm'] / (df_featured['temperatura_c'] + 273.15)

    if _has('par_umol_m2_s', 'temperatura_c'):
        # Mask a zero denominator as NaN: dividing by 0 would produce +/-inf,
        # which downstream scalers and selectors reject outright, whereas NaN
        # is the ordinary missing-value marker that imputation/dropna handle.
        safe_temperature = df_featured['temperatura_c'].replace(0, np.nan)
        df_featured['par_temp_ratio'] = df_featured['par_umol_m2_s'] / safe_temperature

    if _has('aqi_indice', 'par_umol_m2_s'):
        # +1 keeps the denominator away from zero when PAR is 0
        df_featured['calidad_luz_ratio'] = df_featured['aqi_indice'] / (df_featured['par_umol_m2_s'] + 1)

    print("  Creando features de distancias optimales...")

    # 3.3 DISTANCE FROM REFERENCE SET-POINTS
    # output prefix -> (source column, set-point in the column's own unit)
    optimal_conditions = {
        'temperatura': ('temperatura_c', 23),
        'humedad': ('humedad_rel_pct', 65),
        'co2': ('co2_ppm', 800),
        'par': ('par_umol_m2_s', 450),
        'vpd': ('vpd_kpa', 1.2),
    }
    for base_var, (source_col, optimal) in optimal_conditions.items():
        if source_col in df_featured.columns:
            df_featured[f'{base_var}_distancia_optima'] = np.abs(df_featured[source_col] - optimal)

    print("  Creando indices compuestos...")

    # 3.4 COMPOSITE INDICES
    # Environmental stress: mean of PM2.5 / 50 and VOCs / 2
    if _has('pm2_5_ugm3', 'vocs_mgm3'):
        df_featured['indice_estres_ambiental'] = (
            df_featured['pm2_5_ugm3'] / 50 + df_featured['vocs_mgm3'] / 2
        ) / 2

    # Light conditions: (PAR / 800) * (AQI / 100)
    if _has('par_umol_m2_s', 'aqi_indice'):
        df_featured['indice_condiciones_luz'] = (
            df_featured['par_umol_m2_s'] / 800 * df_featured['aqi_indice'] / 100
        )

    print("  Creando features temporales...")

    # 3.5 TIME FEATURES
    if 'hora_dia' in df_featured.columns:
        hour = df_featured['hora_dia']

        # sin/cos pair over a 24-unit cycle
        df_featured['hora_sin'] = np.sin(2 * np.pi * hour / 24)
        df_featured['hora_cos'] = np.cos(2 * np.pi * hour / 24)

        # Light period: hours 6 through 18, both ends included
        light_mask = (hour >= 6) & (hour <= 18)
        df_featured['periodo_luz'] = light_mask.astype(int)

        # Simulated light intensity: half sine over the light period
        # (peak at hour 12), zero outside it
        df_featured['intensidad_luz_hora'] = np.where(
            light_mask,
            np.sin(np.pi * (hour - 6) / 12),
            0
        )

    if 'dia_semana' in df_featured.columns:
        weekday = df_featured['dia_semana']
        df_featured['dia_sin'] = np.sin(2 * np.pi * weekday / 7)
        df_featured['dia_cos'] = np.cos(2 * np.pi * weekday / 7)
        # NOTE(review): `>= 5` presumably assumes 0 = Monday .. 6 = Sunday; confirm
        df_featured['es_fin_semana'] = (weekday >= 5).astype(int)

    if 'mes' in df_featured.columns:
        month = df_featured['mes']
        df_featured['mes_sin'] = np.sin(2 * np.pi * month / 12)
        df_featured['mes_cos'] = np.cos(2 * np.pi * month / 12)

    print("  Creando features de transformaciones...")

    # 3.6 NON-LINEAR TRANSFORMS
    # log(1 + x), so that a value of 0 maps to 0 instead of -inf
    for var in ['pm2_5_ugm3', 'vocs_mgm3', 'co2_ppm']:
        if var in df_featured.columns:
            df_featured[f'{var}_log'] = np.log1p(df_featured[var])

    # Squared terms
    for var in ['temperatura_c', 'par_umol_m2_s']:
        if var in df_featured.columns:
            df_featured[f'{var}_squared'] = df_featured[var] ** 2

    # Square roots
    for var in ['co2_ppm', 'par_umol_m2_s']:
        if var in df_featured.columns:
            df_featured[f'{var}_sqrt'] = np.sqrt(df_featured[var])

    return df_featured

# Run the feature-engineering step on the full processed dataset
df_featured = create_advanced_features(df)

# Column counts: newly engineered features, then the overall total
print(f"\nFeatures creadas: {df_featured.shape[1] - df.shape[1]}")
print(f"Total features: {df_featured.shape[1]}")