**Importar librerías**

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

**Subir archivo base**

In [None]:
# Upload the dataset through the Colab file picker when running on Colab.
# Outside Colab the google.colab package is not installed; in that case the
# cell is a no-op and bank-full.csv must already be in the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = {}  # keep the name defined so later references do not fail
else:
    uploaded = files.upload()

Saving bank-full.csv to bank-full.csv


In [None]:
# Load the CSV (comma-delimited, which is the pandas default separator)
# and show the resulting frame as the cell output.
df = pd.read_csv("bank-full.csv")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


**Preprocesamiento**

In [None]:
# Basic preprocessing: build the binary target and the feature matrix X.

# Columns that carry the label and must never be used as predictors.
target_cols = ('y', 'y_bin')

# Encode the target as 0/1. If 'y' is already non-object (e.g. numeric) it is
# used unchanged.
df = df.copy()
if df['y'].dtype == 'object':
    df['y_bin'] = (df['y'] == 'yes').astype(int)
else:
    df['y_bin'] = df['y']

# Pick predictor columns automatically by dtype. Both target columns are
# filtered out of BOTH lists: when 'y' is already numeric it would otherwise
# remain in numeric_cols and leak the label into X.
numeric_cols = [
    c for c in df.select_dtypes(include=['int64','float64']).columns.tolist()
    if c not in target_cols
]
categorical_cols = [
    c for c in df.select_dtypes(include=['object','category']).columns.tolist()
    if c not in target_cols
]

print('Numéricas:', numeric_cols)
print('Categoricas:', categorical_cols)

X = df[numeric_cols + categorical_cols]
y = df['y_bin']

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print('Split realizado — tamaños:', X_train.shape, X_test.shape)

Numéricas: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
Categoricas: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Split realizado — tamaños: (36168, 16) (9043, 16)


**Selección de features**

In [None]:
# Pipeline: ColumnTransformer preprocessing -> SelectKBest -> RandomForest.

# Numeric columns are standardised.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical columns are one-hot encoded; handle_unknown='ignore' keeps
# transform from raising on categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])


def seeded_mutual_info(X, y):
    """Score features with mutual_info_classif using a fixed random_state.

    mutual_info_classif is randomised unless random_state is set, so without
    a seed the set of selected features can change from run to run.
    """
    return mutual_info_classif(X, y, random_state=42)


# Feature selector: mutual information works with the mixed scaled/one-hot
# matrix; chi2 is not an option here because it requires non-negative inputs
# and StandardScaler produces negative values.
selector = SelectKBest(score_func=seeded_mutual_info, k=20)

clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

pipe = Pipeline(steps=[('pre', preprocessor), ('select', selector), ('clf', clf)])

# Quick training run on the training split.
pipe.fit(X_train, y_train)
print('Pipeline entrenado.')

# Recover the names of the selected features.
# Pipeline fits its steps in place, so the 'pre' step is already fitted and
# does not need to be refitted to read the one-hot feature names.
fitted_pre = pipe.named_steps['pre']
num_out = numeric_cols
fitted_ohe = fitted_pre.named_transformers_['cat'].named_steps['ohe']
cat_features = fitted_ohe.get_feature_names_out(categorical_cols).tolist()
# ColumnTransformer emits its blocks in declaration order: numeric, then categorical.
all_features = list(num_out) + cat_features
mask = pipe.named_steps['select'].get_support()
# Fail loudly instead of letting zip() silently pair names with the wrong columns.
assert len(all_features) == len(mask), (
    'feature-name count ({}) does not match selector mask length ({})'.format(
        len(all_features), len(mask))
)
selected_features = [f for f, m in zip(all_features, mask) if m]
print('\nFeatures seleccionadas ({}):'.format(len(selected_features)))
print(selected_features)

Pipeline entrenado.

Features seleccionadas (20):
['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'marital_single', 'education_secondary', 'default_no', 'housing_no', 'housing_yes', 'loan_no', 'contact_cellular', 'contact_unknown', 'month_mar', 'month_may', 'month_sep', 'poutcome_success', 'poutcome_unknown']


**Métricas e informe**



In [None]:
# Predictions and metrics on the held-out test set.
y_pred = pipe.predict(X_test)
# Probability of the positive class (column 1). The previous
# `hasattr(...) or True` guard was always true, so the probability was
# computed unconditionally; this states that directly.
y_proba = pipe.predict_proba(X_test)[:, 1]

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, zero_division=0))
print('Recall:', recall_score(y_test, y_pred, zero_division=0))
print('F1:', f1_score(y_test, y_pred, zero_division=0))
try:
    print('ROC AUC:', roc_auc_score(y_test, y_proba))
except ValueError as e:
    # roc_auc_score raises ValueError when y_test contains a single class.
    print('ROC AUC no disponible:', e)

print('\nClassification report:\n')
print(classification_report(y_test, y_pred, zero_division=0))
print('\nConfusion matrix:\n', confusion_matrix(y_test, y_pred))

Accuracy: 0.9020236647130377
Precision: 0.6331269349845201
Recall: 0.38657844990548207
F1: 0.4800469483568075
ROC AUC: 0.9127010947984937

Classification report:

              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.63      0.39      0.48      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.68      0.71      9043
weighted avg       0.89      0.90      0.89      9043


Confusion matrix:
 [[7748  237]
 [ 649  409]]


In [None]:
# Cross-validation of the full pipeline (no GridSearch, to keep it fast).
# An integer `cv` makes cross_val_score use StratifiedKFold WITHOUT shuffling,
# i.e. contiguous blocks of rows. The rows appear to be in chronological order
# (the preview runs from 'may' to 'nov'), so contiguous folds are not
# comparable to the shuffled hold-out split used above. Shuffle with a fixed
# seed so both evaluations sample the data the same way and stay reproducible.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipe, X, y, cv=cv_folds, scoring='f1')
print('F1 cross-val (5-fold):', scores, 'mean:', scores.mean())


F1 cross-val (5-fold): [0.05156538 0.09175121 0.1038206  0.11940299 0.35433751] mean: 0.14417553581306208


## Conclusiones

El modelo Random Forest obtuvo un buen rendimiento en la clase negativa (“no”), con una precisión del 92% y un recall del 97%. Sin embargo, el desempeño en la clase positiva (“yes”) fue deficiente, particularmente en el recall (39%), lo que indica que el modelo deja pasar la mayoría de los casos positivos. A pesar de ello, el ROC AUC de 0.91 demuestra que el modelo tiene una adecuada capacidad discriminativa, lo que sugiere que el uso de umbrales de decisión alternativos podría mejorar la detección de la clase positiva.

La validación cruzada evidencia una alta variabilidad en el F1-score entre folds (de 0.05 a 0.35, con media 0.14), muy por debajo del F1 de 0.48 obtenido en el conjunto de prueba, lo cual indica que el modelo no generaliza de forma estable. Por lo tanto, se recomienda:

- Aplicar técnicas de balanceo (SMOTE, undersampling, class_weight).

- Ajustar k en SelectKBest mediante GridSearch.

- Optimizar hiperparámetros del RandomForest.

- Evaluar otros modelos (XGBoost, Logistic Regression con regularización).

En resumen, el modelo muestra un buen comportamiento en la clase mayoritaria, pero no es adecuado aún para predecir correctamente la clase minoritaria, que suele ser la más relevante en problemas de marketing o conversión.