Incluye ahora:

✔ Activación correcta de IterativeImputer

✔ Verificación de columnas reales

✔ Ingeniería de nuevas características útiles y seguras:

Alone_vs_Friends

Outdoor_Activity

Posting_vs_Friends

✔ Modelo StackingClassifier con RandomForest optimizado vía BayesSearchCV

✔ Archivo submission_stacking_4.csv listo para subir

In [6]:
# -----------------------------
# PREDICT INTROVERT VS EXTROVERT - STACKING MODEL (VALIDADO)
# -----------------------------

# 📦 LIBRERÍAS
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer  # habilita imputador
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from skopt import BayesSearchCV
import warnings
warnings.filterwarnings("ignore")

# -----------------------------
# 📥 CARGA Y PREPROCESAMIENTO
# -----------------------------
# Read the train and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Keep the test ids aside: preprocess() drops the 'id' column, and the ids are
# needed again when the submission file is assembled.
test_ids = test_df['id']

# 🔧 Unificar preprocesamiento
def preprocess(df):
    """Return a cleaned copy of *df* ready for imputation.

    The 'id' column is removed when present, and the two Yes/No columns
    ('Stage_fear' and 'Drained_after_socializing') are mapped to 1/0.
    Values other than 'Yes'/'No' (including missing values) become NaN.

    Args:
        df: Raw DataFrame containing at least the two Yes/No columns.

    Returns:
        A new DataFrame; the input is left unmodified.

    Raises:
        KeyError: If either Yes/No column is absent from *df*.
    """
    yes_no_codes = {'Yes': 1, 'No': 0}
    cleaned = df.copy()
    cleaned = cleaned.drop(columns=['id'], errors='ignore')
    for binary_col in ('Stage_fear', 'Drained_after_socializing'):
        cleaned[binary_col] = cleaned[binary_col].map(yes_no_codes)
    return cleaned

# Apply the shared cleaning step to both splits.
train_df, test_df = preprocess(train_df), preprocess(test_df)

# Split off the target and encode its string labels as integers; `le` is kept
# so predictions can be mapped back to the original labels later.
X = train_df.drop(columns='Personality')
le = LabelEncoder()
y = le.fit_transform(train_df['Personality'])

# Impute missing values. The imputer is fitted on the training features only
# and then reused, unchanged, on the test features.
imputer = IterativeImputer(random_state=42)
train_filled = imputer.fit_transform(X)
test_filled = imputer.transform(test_df)
X_imputed = pd.DataFrame(train_filled, columns=X.columns)
test_imputed = pd.DataFrame(test_filled, columns=test_df.columns)

# Engineered features, added identically (and in the same column order) to
# the train and test frames. The "+ 1" in the denominators avoids dividing by
# a zero friends-circle size.
for frame in (X_imputed, test_imputed):
    friends_plus_one = frame['Friends_circle_size'] + 1
    frame['Alone_vs_Friends'] = frame['Time_spent_Alone'] / friends_plus_one
    frame['Outdoor_Activity'] = frame['Going_outside'] * frame['Social_event_attendance']
    frame['Posting_vs_Friends'] = frame['Post_frequency'] / friends_plus_one

# Standardize every feature using statistics learned from the training set.
scaler = StandardScaler().fit(X_imputed)
X_scaled = pd.DataFrame(scaler.transform(X_imputed), columns=X_imputed.columns)
test_scaled = pd.DataFrame(scaler.transform(test_imputed), columns=test_imputed.columns)

# -----------------------------
# 🔍 OPTIMIZACIÓN DE RANDOM FOREST
# -----------------------------
# Random forest to tune: seeded, with class weights balanced.
rf_base = RandomForestClassifier(random_state=42, class_weight='balanced')

# Integer ranges (low, high) explored for each hyper-parameter.
rf_space = {
    'n_estimators': (100, 1000),
    'max_depth': (5, 50),
    'min_samples_leaf': (1, 10),
}

# Bayesian search: 25 sampled configurations, each scored by 3-fold
# cross-validated accuracy, using all available cores.
rf_search = BayesSearchCV(
    estimator=rf_base,
    search_spaces=rf_space,
    n_iter=25,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)
rf_search.fit(X_scaled, y)

# Best configuration found by the search, reused as a base learner below.
best_rf = rf_search.best_estimator_

# -----------------------------
# 🤖 STACKING ENSEMBLE
# -----------------------------
# Base learners for the stack: the tuned random forest plus XGBoost and
# LightGBM classifiers, all seeded with the same random state.
xgb_clf = XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)
lgbm_clf = LGBMClassifier(random_state=42)
estimators = [('rf', best_rf), ('xgb', xgb_clf), ('lgbm', lgbm_clf)]

# Shuffled, stratified 5-fold splitter handed to the stacking classifier as
# its internal cross-validation strategy.
stack_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Logistic regression combines the base learners' outputs.
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=stack_cv,
    n_jobs=-1,
)

# Fit the final model on the full scaled training set.
stack_model.fit(X_scaled, y)

# -----------------------------
# ✅ VALIDACIÓN CRUZADA (Métricas Clave)
# -----------------------------
# Local import: the file's import section only brings in cross_val_score.
from sklearn.model_selection import cross_validate

# Evaluate all three metrics in a single 5-fold cross-validation pass. Calling
# cross_val_score once per metric would refit the whole stacking ensemble on
# the same folds three times; cross_validate fits once per fold and scores
# every metric on that fit.
# NOTE(review): the imputer, the scaler and the random-forest hyper-parameters
# were all fitted on the full training set before this step, so these scores
# are likely somewhat optimistic.
cv_results = cross_validate(
    stack_model,
    X_scaled,
    y,
    cv=5,
    scoring=['accuracy', 'f1_macro', 'roc_auc_ovr'],
)
print("\n🔎 Validación cruzada (accuracy):", cv_results['test_accuracy'].mean())
print("🔎 Validación cruzada (F1 macro):", cv_results['test_f1_macro'].mean())
print("🔎 Validación cruzada (ROC AUC OvR):", cv_results['test_roc_auc_ovr'].mean())

# -----------------------------
# 📤 PREDICCIÓN Y SUBMISSION
# -----------------------------
# Predict on the scaled test set and map the integer classes back to the
# original string labels via the fitted LabelEncoder.
y_pred_test = stack_model.predict(test_scaled)
preds_labels = le.inverse_transform(y_pred_test.astype(int))

# Submission layout: the test ids saved before preprocessing, paired with the
# predicted label for each row.
submission = pd.DataFrame({
    'id': test_ids,
    'Personality': preds_labels
})

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written (the message previously
# reported 'submission_stacking.csv' while the file was saved under a
# different name).
submission_path = "submission_stacking_4.csv"
submission.to_csv(submission_path, index=False)

print(f"\n✅ Archivo '{submission_path}' generado y listo para subir a Kaggle")



🔎 Validación cruzada (accuracy): 0.969067441406307
🔎 Validación cruzada (F1 macro): 0.9597250848239203
🔎 Validación cruzada (ROC AUC OvR): 0.9662781166508765

✅ Archivo 'submission_stacking.csv' generado y listo para subir a Kaggle
