In [134]:
# importamos las librerías necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [135]:
# Load the training and test splits from the CSV files in the working directory
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [136]:
# Preview the first five rows of the training data
df_train.head(n=5)

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [137]:
# Inspect column dtypes, non-null counts and memory usage of the training frame
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB


In [138]:
# Split the training frame into the target (`y`) and the predictors.
# `id` is dropped together with the target so it is not used as a feature.
y_train = df_train['y']
X_train = df_train.drop(['id', 'y'], axis=1)


In [139]:
# Partition the predictor columns by dtype: object columns are treated as
# categorical, every numeric dtype as numerical.
categorical_features = list(X_train.select_dtypes(include='object').columns)
numerical_features = list(X_train.select_dtypes(include='number').columns)

# Show which columns ended up in each group
print("Categorical Features:", categorical_features)
print("Numerical Features:", numerical_features)

# Keep one sub-frame per group for the preprocessing steps
X_train_num = X_train[numerical_features]
X_train_cat = X_train[categorical_features]

# Count missing values per categorical column
X_train_cat.isna().sum()


Categorical Features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numerical Features: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


job          0
marital      0
education    0
default      0
housing      0
loan         0
contact      0
month        0
poutcome     0
dtype: int64

In [140]:
# Realizamos un pipeline para el preprocesamiento de las variables categóricas
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Encode each categorical column as ordinal codes. Categories that were not
# seen during fit are mapped to -1 at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Route the categorical columns through the encoder.
# Only the categorical columns are passed to fit_transform below, so there are
# no remainder columns to pass through here; the numerical features are
# re-attached in a later cell.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Fit on the categorical columns taken directly from X_train rather than from
# the previous value of X_train_cat: this cell rebinds X_train_cat from a
# DataFrame to the encoded array, so reading it back as input would make a
# second run of the cell fail (column selection by name needs a DataFrame).
X_train_cat = preprocessor.fit_transform(X_train[categorical_features])



In [141]:
# Wrap the encoded array in a DataFrame, labelling the columns with the
# feature names reported by the fitted preprocessor
encoded_column_names = preprocessor.get_feature_names_out(categorical_features)
X_train_cat_df = pd.DataFrame(data=X_train_cat, columns=encoded_column_names)
X_train_cat_df.head(n=5)


Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,9.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0
1,1.0,1.0,1.0,0.0,0.0,0.0,2.0,6.0,3.0
2,1.0,1.0,1.0,0.0,1.0,0.0,2.0,8.0,3.0
3,8.0,2.0,1.0,0.0,1.0,0.0,2.0,8.0,3.0
4,9.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0,3.0


In [142]:
# Put the encoded categorical columns and the untouched numerical columns side
# by side. The numerical frame's index is reset so both parts align by position.
numeric_part = X_train_num.reset_index(drop=True)
X_train_processed = pd.concat((X_train_cat_df, numeric_part), axis='columns')
# Preview the final feature matrix
X_train_processed.head(n=5)

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,age,balance,day,duration,campaign,pdays,previous
0,9.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,42,7,25,117,3,-1,0
1,1.0,1.0,1.0,0.0,0.0,0.0,2.0,6.0,3.0,38,514,18,185,1,-1,0
2,1.0,1.0,1.0,0.0,1.0,0.0,2.0,8.0,3.0,36,602,14,111,2,-1,0
3,8.0,2.0,1.0,0.0,1.0,0.0,2.0,8.0,3.0,27,34,28,10,2,-1,0
4,9.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0,3.0,26,889,3,902,1,-1,0


In [None]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, roc_auc_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# `scope.int` converts the sampled value to a Python int inside the search
# space itself (also when the space is evaluated with space_eval).
from hyperopt.pyll import scope

# Hyperparameter search space for the random forest.
# min_samples_split and min_samples_leaf are integer-valued, so they are drawn
# with quniform (step 1) and cast to int. Sampling them with a continuous
# uniform and truncating afterwards makes the upper bounds (10 and 5)
# effectively unreachable and yields float values when the best point is
# evaluated.
space = {
    'n_estimators': hp.choice('n_estimators', [100, 200, 300]),
    'max_depth': hp.choice('max_depth', [10, 20, 30]),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 5, 1)),
}

# Objective function minimised by the hyperparameter search
def objective(params):
    """Score one random-forest configuration with 3-fold cross-validation.

    Parameters
    ----------
    params : dict
        Hyperparameters for ``RandomForestClassifier``. ``min_samples_split``
        and ``min_samples_leaf`` are truncated to int before use.

    Returns
    -------
    dict
        ``{'loss': -score, 'status': STATUS_OK}`` where ``score`` is the
        equally weighted mean of cross-validated accuracy and ROC AUC. The
        sign is flipped because the optimiser minimises the loss.
    """
    # Work on a copy so the dict handed in by the caller is never mutated.
    rf_params = dict(params)
    rf_params['min_samples_split'] = int(rf_params['min_samples_split'])
    rf_params['min_samples_leaf'] = int(rf_params['min_samples_leaf'])

    # The forest already trains its trees in parallel (n_jobs=-1), so the
    # folds are evaluated one after another; requesting all cores at both
    # levels would nest parallelism and oversubscribe the CPU.
    clf = RandomForestClassifier(**rf_params, random_state=42, n_jobs=-1)

    # Evaluate on the full training set with both metrics in a single pass
    scores = cross_validate(
        clf,
        X_train_processed,
        y_train,
        cv=3,
        scoring={'accuracy': 'accuracy', 'roc_auc': 'roc_auc'},
    )

    # Average each metric over the folds and blend them 50/50
    accuracy = scores['test_accuracy'].mean()
    auc = scores['test_roc_auc'].mean()
    score = (0.5 * accuracy) + (0.5 * auc)

    print(f"Params: {rf_params}")
    print(f"Accuracy: {accuracy:.4f}, AUC: {auc:.4f}, Combined: {score:.4f}")

    return {'loss': -score, 'status': STATUS_OK}



# Run the hyperparameter search: 30 evaluations of `objective` guided by TPE.
# NOTE(review): no `rstate` is passed to fmin, so the sequence of trials is
# not seeded and may differ between runs.
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=30,
            trials=trials)

# Map the raw result of fmin back to concrete hyperparameter values
best_params = dict(space_eval(space, best))

# These two parameters can come back as floats (e.g. 5.37). scikit-learn
# interprets a float here as a fraction of the samples and rejects values
# above 1, so the final fit would fail. Cast them to int, exactly as
# `objective` does, so the final model uses the values that were scored.
for int_param in ('min_samples_split', 'min_samples_leaf'):
    best_params[int_param] = int(best_params[int_param])
print("\nBest parameters found:", best_params)

# Train the final model on the full training set with the best hyperparameters
model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
model.fit(X_train_processed, y_train)

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]