# In [None]:
# ML PIPELINE PARA NUEVO EQUIPO 🚀

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# 1. LOAD DATA
# Read the dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv("tus_datos.csv")  # Replace with your dataset

# 2. BASIC EDA
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe())
# Number of missing values per column.
print(df.isnull().sum())
# Pairwise plots restricted to the float64/int64 columns.
sns.pairplot(df.select_dtypes(include=['float64', 'int64']))
plt.show()

# 3. PREPROCESSING
# Drop every row that contains at least one missing value.
df = df.dropna()

# Separate features and target (adjust 'target' to your label column)
X = df.drop('target', axis=1)
y = df['target']

# 4. TRAIN/TEST SPLIT
# Split BEFORE fitting any transformer: fitting the scaler / power transform on
# the full dataset would leak test-set statistics into the training features
# and make the evaluation below optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling and transformation
# Both transformers learn their parameters from the training split only; the
# test split is transformed with those already-fitted parameters.
scaler = StandardScaler()
power = PowerTransformer(method='yeo-johnson')
X_train = power.fit_transform(scaler.fit_transform(X_train))
X_test = power.transform(scaler.transform(X_test))

# 5. MODEL SELECTION
# Candidate classifiers keyed by the display name used in the reports.
# The tree-based models are seeded so repeated runs give the same results,
# matching the fixed seed (42) used for the split and the final pipeline.
modelos = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Árbol de Decisión": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# 6. TRAINING AND EVALUATION
# Fit every candidate on the training split, predict the test split and print
# its confusion matrix followed by its classification report.
for nombre in modelos:
    modelo = modelos[nombre]
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = modelo.fit(X_train, y_train).predict(X_test)
    print(f"\n📌 Modelo: {nombre}")
    for metrica in (confusion_matrix, classification_report):
        print(metrica(y_test, y_pred))

# 7. HYPERPARAMETER TUNING FOR LOGISTIC REGRESSION
# Search over regularisation strength and penalty type; 'liblinear' is the
# solver listed because it supports both the 'l1' and 'l2' penalties.
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0, 100.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}
# liblinear shuffles the data using random_state, so the estimator is seeded
# to keep the search reproducible (same seed as the rest of the script).
grid_log = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    cv=5,
    scoring='f1_macro',
)
grid_log.fit(X_train, y_train)
print("\n🔧 Mejor Logistic Regression:", grid_log.best_params_)

# 8. CROSS-VALIDATION
# Score the tuned logistic regression with 5-fold CV (macro F1) on the
# training split and report mean and standard deviation across folds.
# NOTE(review): this is the same training data that drove the grid search, so
# the estimate is likely optimistic — consider nested CV if an unbiased figure
# is needed.
score = cross_val_score(
    estimator=grid_log.best_estimator_,
    X=X_train,
    y=y_train,
    scoring='f1_macro',
    cv=5,
)
print(f"\n🔁 Validación cruzada (LogReg): Media F1: {score.mean():.3f} +/- {score.std():.3f}")

# 9. FINAL PIPELINE: COMPLETE EXAMPLE WITH RANDOM FOREST
# The pipeline bundles scaling, power transform and the classifier, so the
# preprocessing is fitted on whatever data is passed to fit() and re-applied
# automatically at predict time.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('transform', PowerTransformer()),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Because the pipeline preprocesses its input itself, it must receive the RAW
# features: X_train / X_test have already been scaled and power-transformed,
# and feeding them here would apply both steps a second time. Re-splitting X
# with the same seed and stratification yields the same train/test partition.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe.fit(X_train_raw, y_train_raw)
y_pred_pipe = pipe.predict(X_test_raw)
print("\n🚀 Pipeline Random Forest - Reporte de Clasificación:")
print(classification_report(y_test_raw, y_pred_pipe))
