In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Visualization settings: default matplotlib style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Title banner framed by two 60-character rules
print(
    "=" * 60,
    "PREPROCESAMIENTO DE DATOS - AGRICULTURA VERTICAL",
    "=" * 60,
    sep="\n",
)

1. CARGA DE DATOS

In [None]:
print("\n1. CARGANDO DATOS...")
df = pd.read_csv('../data/raw/agricultura_vertical_dataset.csv')

# Keep an untouched snapshot of the raw frame for later before/after comparisons
df_original = df.copy()

# Report the dimensions and the column names of the freshly loaded frame
print(
    f"Shape original: {df.shape}",
    f"Columnas: {list(df.columns)}",
    sep="\n",
)

2. ANÁLISIS DE CALIDAD DE DATOS

In [None]:
print("\n2. ANALISIS DE CALIDAD DE DATOS...")

def analyze_data_quality(df):
    """Build a per-column data-quality summary.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame with one row per column of ``df`` (indexed by column name)
        holding the column name, dtype, non-null count, null count, null
        percentage and number of distinct values. ``Duplicate_Rows`` is a
        frame-level figure (number of fully repeated rows) and therefore
        carries the same value on every row.
    """
    total_rows = len(df)
    null_counts = df.isnull().sum()
    repeated_rows = total_rows - len(df.drop_duplicates())

    report_fields = {
        'Column': df.columns,
        'Dtype': df.dtypes,
        'Non_Null_Count': df.count(),
        'Null_Count': null_counts,
        'Null_Percentage': (null_counts / total_rows) * 100,
        'Unique_Count': df.nunique(),
        'Duplicate_Rows': repeated_rows,
    }
    return pd.DataFrame(report_fields)

# Build the per-column quality report and show it under its heading
quality_report = analyze_data_quality(df)
print("\nREPORTE DE CALIDAD:", quality_report, sep="\n")

# Count rows that are exact repeats of an earlier row
duplicates = df.duplicated().sum()
print(f"\nFilas duplicadas: {duplicates}")

if duplicates > 0:
    print("Eliminando duplicados...")
    # Keep the first occurrence of each row and renumber the index from 0
    df = df.drop_duplicates(ignore_index=True)
    print(f"Shape después de eliminar duplicados: {df.shape}")


In [None]:
3. MANEJO DE VALORES FALTANTES

In [None]:
# ============================================================================
# 3. MISSING-VALUE HANDLING
# ============================================================================

print("\n3. MANEJO DE VALORES FALTANTES...")

# Per-column count of missing cells and their share of all rows
missing_data = df.isna().sum()
missing_percentage = missing_data.div(len(df)).mul(100)

# Side-by-side summary, worst-affected columns first
missing_df = pd.concat(
    [
        missing_data.rename('Missing_Count'),
        missing_percentage.rename('Missing_Percentage'),
    ],
    axis=1,
).sort_values('Missing_Percentage', ascending=False)

print("\nVALORES FALTANTES POR COLUMNA:")
# Only list columns that actually have gaps
print(missing_df.loc[missing_df['Missing_Count'] > 0])

# Imputation strategies
def handle_missing_values(df):
    """Impute missing values: median for numeric columns, mode for text columns.

    The input frame is not modified; imputation is applied to a copy. One line
    is printed for every column that is imputed or skipped.

    Args:
        df: DataFrame to impute.

    Returns:
        A new DataFrame in which missing cells of numeric columns are replaced
        by the column median, and missing cells of object/string columns by the
        column's most frequent value. A column with no observed values at all
        has no median or mode, so it is reported and left unchanged.
    """
    df_processed = df.copy()

    # Numeric columns: the median is more robust to outliers than the mean
    numeric_cols = df_processed.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        n_missing = df_processed[col].isnull().sum()
        if n_missing == 0:
            continue

        median_value = df_processed[col].median()
        if pd.isna(median_value):
            # Column is entirely empty: filling with NaN would change nothing
            print(f"  - {col}: sin valores observados, no se puede imputar con mediana")
            continue

        # Assign the result back instead of calling fillna(inplace=True) on
        # df_processed[col]: that chained in-place form acts on a temporary
        # object and does not update the frame under pandas Copy-on-Write.
        df_processed[col] = df_processed[col].fillna(median_value)
        print(f"  - {col}: {n_missing} valores imputados con mediana {median_value:.2f}")

    # Text columns: object dtype plus pandas' dedicated string dtype
    categorical_cols = [
        col for col in df_processed.columns
        if pd.api.types.is_object_dtype(df_processed[col].dtype)
        or isinstance(df_processed[col].dtype, pd.StringDtype)
    ]

    for col in categorical_cols:
        n_missing = df_processed[col].isnull().sum()
        if n_missing == 0:
            continue

        modes = df_processed[col].mode()
        if modes.empty:
            # Column is entirely empty: there is no most-frequent value
            print(f"  - {col}: sin valores observados, no se puede imputar con moda")
            continue

        # mode() may return several ties; take the first one
        mode_value = modes.iloc[0]
        df_processed[col] = df_processed[col].fillna(mode_value)
        print(f"  - {col}: {n_missing} valores imputados con moda '{mode_value}'")

    return df_processed

# Run the imputation only when at least one cell is missing
if missing_data.sum() == 0:
    print("No se encontraron valores faltantes.")
else:
    df = handle_missing_values(df)
