### Pipeline con Transformaciones y Guardado

In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
clean_datos = pd.read_csv('bank_dataset_clean.csv')

In [3]:
clean_datos = clean_datos.drop(columns=['month'])

In [4]:
clean_datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   job        11162 non-null  object 
 1   marital    11162 non-null  object 
 2   education  11162 non-null  object 
 3   default    11162 non-null  object 
 4   balance    11162 non-null  float64
 5   housing    11162 non-null  object 
 6   loan       11162 non-null  object 
 7   deposit    11162 non-null  object 
dtypes: float64(1), object(7)
memory usage: 697.8+ KB


In [5]:
# Definir las características (X) y la variable objetivo (y)
X = clean_datos.drop('deposit', axis=1)  
y = clean_datos['deposit']

# Dividir en conjuntos de entrenamiento y prueba (70% - 30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Verificar las formas de los conjuntos resultantes
print("Tamaño de X_train:", X_train.shape)
print("Tamaño de X_test:", X_test.shape)
print("Tamaño de y_train:", y_train.shape)
print("Tamaño de y_test:", y_test.shape)

Tamaño de X_train: (7813, 7)
Tamaño de X_test: (3349, 7)
Tamaño de y_train: (7813,)
Tamaño de y_test: (3349,)


In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7813 entries, 7524 to 3617
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   job        7813 non-null   object 
 1   marital    7813 non-null   object 
 2   education  7813 non-null   object 
 3   default    7813 non-null   object 
 4   balance    7813 non-null   float64
 5   housing    7813 non-null   object 
 6   loan       7813 non-null   object 
dtypes: float64(1), object(6)
memory usage: 488.3+ KB


In [7]:
# Codificación de la variable objetivo
X_train['default_encoded'] = X_train['default'].map({'yes': 1, 'no': 0})

# Verificar los cambios
print(X_train[['default', 'default_encoded']].tail())

# Imprimir el valor en la fila 0 de la columna 'deposit'
print(X_train.loc[100, ['default', 'default_encoded']])
print(X_train.loc[11160, ['default', 'default_encoded']])

# Codificación de la variable objetivo
X_train['housing_encoded'] = X_train['housing'].map({'yes': 1, 'no': 0})

# Verificar los cambios
print(X_train[['housing', 'housing_encoded']].tail())

# Imprimir el valor en la fila 0 de la columna 'deposit'
print(X_train.loc[100, ['housing', 'housing_encoded']])
print(X_train.loc[11160, ['housing', 'housing_encoded']])

# Codificación de la variable objetivo
X_train['loan_encoded'] = X_train['loan'].map({'yes': 1, 'no': 0})

# Verificar los cambios
print(X_train[['loan', 'loan_encoded']].tail())

# Imprimir el valor en la fila 0 de la columna 'deposit'
print(X_train.loc[100, ['loan', 'loan_encoded']])
print(X_train.loc[11160, ['loan', 'loan_encoded']])

     default  default_encoded
9487      no                0
3221      no                0
9097      no                0
9743      no                0
3617      no                0
default            no
default_encoded     0
Name: 100, dtype: object
default            no
default_encoded     0
Name: 11160, dtype: object
     housing  housing_encoded
9487      no                0
3221      no                0
9097     yes                1
9743     yes                1
3617      no                0
housing            yes
housing_encoded      1
Name: 100, dtype: object
housing            no
housing_encoded     0
Name: 11160, dtype: object
     loan  loan_encoded
9487   no             0
3221   no             0
9097   no             0
9743  yes             1
3617   no             0
loan            no
loan_encoded     0
Name: 100, dtype: object
loan            yes
loan_encoded      1
Name: 11160, dtype: object


In [8]:
print(X_train['default_encoded'].value_counts())
print(X_train['housing_encoded'].value_counts())
print(X_train['loan_encoded'].value_counts())

default_encoded
0    7695
1     118
Name: count, dtype: int64
housing_encoded
0    4115
1    3698
Name: count, dtype: int64
loan_encoded
0    6810
1    1003
Name: count, dtype: int64


In [9]:
drop_col = ['default','housing','loan']

X_train = X_train.drop(columns=drop_col)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7813 entries, 7524 to 3617
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   job              7813 non-null   object 
 1   marital          7813 non-null   object 
 2   education        7813 non-null   object 
 3   balance          7813 non-null   float64
 4   default_encoded  7813 non-null   int64  
 5   housing_encoded  7813 non-null   int64  
 6   loan_encoded     7813 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 746.4+ KB


In [10]:
# Columnas categóricas para OneHotEncoder
categorical_features = ['job', 'marital', 'education']

# Columnas binarias
binary_features = ['default_encoded', 'housing_encoded', 'loan_encoded']

# Columna numérica a escalar
numerical_features = ['balance']

# Crear el preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_features),  # RobustScaler para 'balance'
        ('cat', OneHotEncoder(drop='first'), categorical_features),  # OneHotEncoder para categóricas
    ],
    remainder='passthrough'
)

# Crear y entrenar el escalador
X_train_transformed = preprocessor.fit_transform(X_train)

# Entrenar el modelo de clustering
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_train_transformed)

# Guardar el preprocesador en un archivo .pkl
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

# Guardar el modelo KMeans en un archivo .pkl
with open('kmeans_model.pkl', 'wb') as model_file:
    pickle.dump(kmeans, model_file)

print("Escalador y modelo KMeans guardados exitosamente en archivos separados.")

Escalador y modelo KMeans guardados exitosamente en archivos separados.


In [11]:
# Obtener los nombres de las columnas de OneHotEncoder
ohe_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

# Obtener todos los nombres de columnas después de la transformación
all_feature_names = list(ohe_feature_names) + binary_features + numerical_features

# Ver los nombres de las columnas transformadas
print(all_feature_names)

['job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_unknown', 'marital_married', 'marital_single', 'education_secondary', 'education_tertiary', 'education_unknown', 'default_encoded', 'housing_encoded', 'loan_encoded', 'balance']
