In [None]:
# PIPELINE COMPLETO DE EXPLORACIÓN, LIMPIEZA Y TRANSFORMACIÓN DE DATOS

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# 1. DATA LOADING
df = pd.read_csv("tus_datos.csv")  # Replace with the path to your dataset

# 2. INITIAL EXPLORATION
# Text summary: frame shape, per-column dtypes, per-column null counts.
null_mask = df.isnull()
print("\n🔎 Dimensiones:", df.shape)
print("\n🧾 Tipos de datos:", df.dtypes, sep="\n")
print("\n❓ Valores nulos:", null_mask.sum(), sep="\n")

# Visual summary of missingness: one heatmap cell per (row, column) entry of
# the boolean null mask.
null_ax = sns.heatmap(null_mask, cbar=False, cmap='viridis')
null_ax.set_title("Mapa de nulos")
plt.show()

# Distributions: one histogram with a KDE overlay per numeric column, each
# shown as its own figure.
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    hist_ax = sns.histplot(df[col], kde=True)
    hist_ax.set_title(f"Distribución: {col}")
    plt.show()

# 3. OUTLIER DETECTION AND FILTERING (Z-Score)
from scipy.stats import zscore

# A row is dropped when any numeric column lies more than `threshold`
# standard deviations away from that column's mean.
threshold = 3

# nan_policy='omit' computes each column's mean/std from its non-missing
# values. With the default ('propagate') a single NaN turns the whole column's
# z-scores into NaN, every comparison below evaluates to False, and that
# column's outliers are silently never removed. Missing cells still produce
# NaN z-scores, so rows with gaps are kept here and left for the imputer.
# NOTE(review): `num_cols` was computed on the full frame, so a numeric
# 'target' column also takes part in this filter — confirm that is intended
# (a rare class label can sit beyond 3 sigma).
z_scores = np.abs(zscore(df[num_cols], nan_policy='omit'))
outliers = (z_scores > threshold).any(axis=1)
print(f"\n⚠️ Se eliminarán {outliers.sum()} outliers")
df = df[~outliers]

# 4. SPLIT FEATURES AND TARGET
# Assumes the label column is named 'target'.
y = df['target']
X = df.drop('target', axis=1)

# Build the column lists from X rather than df. Lists taken from df would
# still contain 'target', and the ColumnTransformer that consumes them is
# fitted on X, where that column no longer exists, so fitting would fail.
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=np.number).columns.tolist()

# 5. DEFINE TRANSFORMATIONS
# Numeric columns: median imputation, then standard scaling, then a power
# transform (default PowerTransformer settings).
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('power', PowerTransformer()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: most-frequent imputation, then one-hot encoding.
# Categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column list to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ]
)

# 6. APPLY TRANSFORMATIONS
# Single-step wrapper pipeline around the preprocessor.
pipeline = Pipeline(steps=[('preprocessing', preprocessor)])

# 7. TRAIN/TEST SPLIT
# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 8. TRANSFORM DATA
# Fit the preprocessing on the training rows only, then apply the already
# fitted transformers to the held-out rows.
X_train = pipeline.fit_transform(X_train_raw)
X_test = pipeline.transform(X_test_raw)

print("\n✅ Datos listos para modelado. Shape final:", X_train.shape)
