In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
import pandas as pd
import joblib


In [3]:
# Load the raw extract (comma-separated text, Latin-1 encoded). low_memory=False makes
# pandas read the file in one pass instead of in chunks.
df = pd.read_csv(
    "data/COLL_TEC_CONSOLIDADO.txt",
    sep=",",
    encoding="latin-1",
    low_memory=False,
)
# Work on a separate copy so `df` keeps the data exactly as read from disk.
df_copy = df.copy()


In [4]:
# Separate the label column from the predictors, then hold out 20 % of the rows
# for testing; random_state pins the shuffle so the split is reproducible.
y = df_copy['Variable_objetivo']
X = df_copy.drop(columns='Variable_objetivo')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Row counts of the training predictors and of the test labels, one per line.
print(len(X_train), len(y_test), sep="\n")

1031904
257977


In [6]:
def _marcar_cero(df, filas, columna):
    """Store the integer sentinel 0 in ``df[columna]`` on the rows flagged by ``filas``.

    Modifies ``df`` in place. A datetime column is cast to ``object`` first, so the
    integer is stored through an explicit conversion rather than an implicit dtype
    upcast (which pandas deprecates for setitem-like operations). When ``filas``
    flags no row, the column and its dtype are left untouched.
    """
    if not filas.any():
        return
    if pd.api.types.is_datetime64_any_dtype(df[columna]):
        df[columna] = df[columna].astype(object)
    df.loc[filas, columna] = 0


def limpieza_sin_categoricas(df_copy):
    """Clean the raw frame without encoding categorical columns; the input is not modified.

    Steps, in order:
      1. Parse every column whose name starts with ``Fecha`` or ``Prox`` as datetime
         (values that cannot be parsed become NaT).
      2. Add ``Activo_M1``..``Activo_M6`` set to 1. For month ``j``, rows whose
         ``Saldo_total_Mj``..``Saldo_total_M6`` are all null get ``Activo_Mj = 0`` and a
         0 in the empty cells of ``Ciclo_atraso``/``Pago``/``Fecha_pago``/``Utilizacion``/
         ``Fecha_corte``/``Fecha_limite_pago`` for that month.
      3. Drop ``Behavior*`` columns; fill ``Canal_Pago_M1..M6`` with ``'Desconocido'``
         and fill ``Canal_Pago`` with the row-wise mode of those six columns.
      4. Keep only rows where some ``Fecha_prox_corte_Mi`` equals ``Fecha_corte_M{i-1}``
         (i = 2..6), then drop ``Fecha_prox_corte_M2..M6``.
      5. Express date columns as whole days elapsed since ``fecha_base``; cells that
         hold the sentinel 0 stay 0.
      6. Drop ``Genero*``, ``Fecha_pago*``, ``*Pago_M1..M6*`` and ``Pago``; add
         ``Deuda_Mi = Limite_credito * Utilizacion_Mi``; drop rows with any null left.

    Parameters
    ----------
    df_copy : pandas.DataFrame
        Raw records. The monthly ``*_M1``..``*_M6`` columns used above, ``Canal_Pago``
        and ``Limite_credito`` must be present; the ``Fecha_prox_corte_*`` columns
        are optional.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame. It can have fewer rows than the input (steps 4 and 6); the
        surviving rows keep their original index labels.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    df_copy = df_copy.copy()
    meses = range(1, 7)

    # Step 1: parse date-like columns; errors='coerce' turns bad values into NaT.
    for column in df_copy.columns:
        if column.startswith(('Fecha', 'Prox')):
            df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')

    # Step 2: activity flags, 1 unless the month (and every later one) has no balance.
    for i in meses:
        df_copy[f'Activo_M{i}'] = 1

    for j in meses:
        columnas_check = [f'Saldo_total_M{k}' for k in range(j, 7)]
        columnas_fill = [
            f'Ciclo_atraso_M{j}', f'Pago_M{j}', f'Fecha_pago_M{j}', f'Utilizacion_M{j}',
            f'Fecha_corte_M{j}', f'Fecha_limite_pago_M{j}'
        ]
        mask_nulos = df_copy[columnas_check].isnull().all(axis=1)
        for col in columnas_fill:
            # Only the empty cells of the flagged rows receive the sentinel.
            _marcar_cero(df_copy, mask_nulos & df_copy[col].isnull(), col)
        _marcar_cero(df_copy, mask_nulos, f'Activo_M{j}')

    if 'Fecha_prox_corte_M1' in df_copy.columns:
        sin_saldos = df_copy[[f'Saldo_total_M{k}' for k in meses]].isnull().all(axis=1)
        _marcar_cero(df_copy, sin_saldos, 'Fecha_prox_corte_M1')

    # Step 3: drop 'Behavior' columns and complete the payment-channel columns.
    df_copy = df_copy.drop(columns=df_copy.filter(regex='Behavior').columns)

    columnas_canal_pago = [f'Canal_Pago_M{j}' for j in meses]
    df_copy[columnas_canal_pago] = df_copy[columnas_canal_pago].fillna('Desconocido')
    modas = df_copy[columnas_canal_pago].mode(axis=1)
    if 0 in modas.columns:  # an input without rows yields a mode frame without columns
        # Assign the result back: a chained `df[col].fillna(..., inplace=True)` does not
        # update the frame under pandas Copy-on-Write.
        df_copy['Canal_Pago'] = df_copy['Canal_Pago'].fillna(modas[0])

    # Step 4: keep the rows where any Fecha_prox_corte_Mi equals Fecha_corte_M{i-1}.
    # NOTE(review): the original comment described this step as dropping columns, but
    # the code has always filtered rows - confirm which one is intended.
    mask_fecha_igual = None
    for i in range(2, 7):
        col_prox, col_corte = f'Fecha_prox_corte_M{i}', f'Fecha_corte_M{i-1}'
        if col_prox in df_copy.columns and col_corte in df_copy.columns:
            coincide = df_copy[col_prox] == df_copy[col_corte]
            mask_fecha_igual = coincide if mask_fecha_igual is None else (mask_fecha_igual | coincide)

    # Without any comparable pair there is nothing to check: keep every row instead of
    # indexing the frame with a scalar False (which raises KeyError).
    if mask_fecha_igual is not None:
        df_copy = df_copy[mask_fecha_igual].copy()

    columnas_a_eliminar = [f'Fecha_prox_corte_M{i}' for i in range(2, 7)]
    df_copy = df_copy.drop(columns=[col for col in columnas_a_eliminar if col in df_copy.columns])

    # Step 5: monthly dates -> days since the base date (pandas parses it as 2001-01-01).
    fecha_base = pd.to_datetime('01/01/01')
    for i in meses:
        for col in (f'Fecha_corte_M{i}', f'Fecha_limite_pago_M{i}', f'Fecha_pago_M{i}'):
            if col in df_copy.columns:
                mask_zeros = df_copy[col] == 0
                fechas = pd.to_datetime(df_copy[col], errors='coerce')
                df_copy[col] = (fechas - fecha_base).dt.days
                _marcar_cero(df_copy, mask_zeros, col)  # sentinel cells stay 0

    # Rows without any M1 balance figure: put 0 in every empty cell of the '*M1*' columns.
    mask_nulos_m1 = df_copy[['Saldo_total_M1', 'Saldo_Mes_M1', 'Pago_minimo_M1']].isnull().all(axis=1)
    for col in df_copy.columns[df_copy.columns.str.contains('M1')]:
        _marcar_cero(df_copy, mask_nulos_m1 & df_copy[col].isnull(), col)

    # Step 6: drop columns that are not used downstream.
    df_copy = df_copy.drop(columns=df_copy.filter(regex='Genero').columns)
    df_copy = df_copy.drop(columns=df_copy.filter(regex='Fecha_pago').columns)
    for i in meses:
        # NOTE(review): the pattern is unanchored, so besides Pago_M{i} it also matches
        # (and removes) Canal_Pago_M{i} - confirm that is intended.
        df_copy = df_copy.drop(columns=df_copy.filter(regex=f'Pago_M{i}').columns)
    df_copy = df_copy.drop(columns=['Pago'], errors='ignore')

    for i in meses:
        df_copy[f'Deuda_M{i}'] = df_copy['Limite_credito'] * df_copy[f'Utilizacion_M{i}']

    # One dropna over all columns; it already covers the Saldo_total / Saldo_Mes /
    # Pago_minimo subset that used to be checked separately just before it.
    df_copy = df_copy.dropna()

    # Remaining (non-monthly) date columns -> days since fecha_base.
    sufijos_mes = ('M1', 'M2', 'M3', 'M4', 'M5', 'M6')
    for column in df_copy.columns:
        es_fecha_general = column.startswith('Fecha') and not column.endswith(sufijos_mes)
        if es_fecha_general or column.startswith('Prox'):
            df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')

    for col in df_copy.select_dtypes(include=['datetime64[ns]']).columns:
        df_copy[col] = (df_copy[col] - fecha_base).dt.days

    # Fecha_prox_corte_M1 is either already numeric (handled by the loop above) or an
    # object column mixing timestamps with the sentinel 0. Only the second case is
    # parsed here: re-parsing day counts would read them as nanoseconds since 1970.
    if 'Fecha_prox_corte_M1' in df_copy.columns:
        prox_corte = df_copy['Fecha_prox_corte_M1']
        if not pd.api.types.is_numeric_dtype(prox_corte):
            mask_zeros_2 = prox_corte == 0
            # Blank the sentinel before parsing; fillna(0) below restores it as 0.
            fechas = pd.to_datetime(prox_corte.mask(mask_zeros_2), errors='coerce')
            prox_corte = (fechas - fecha_base).dt.days
        df_copy['Fecha_prox_corte_M1'] = prox_corte.fillna(0).astype(int)

    # Categorical variables are deliberately not encoded here; the pipeline does it later.

    return df_copy


In [7]:
# Expose the cleaning function as a pipeline step. The wrapped function filters rows
# and calls dropna, so the frame leaving this step can have fewer rows than the one
# that entered it.
limpieza_transformer = FunctionTransformer(limpieza_sin_categoricas)

class Preprocesador(BaseEstimator, TransformerMixin):
    """One-hot encode the object columns and standardise the int64/float64 columns.

    Column groups are detected by dtype when ``fit`` is called. ``transform`` returns a
    DataFrame holding the scaled numeric columns followed by the one-hot columns, on
    the same index as its input. Columns of any other dtype are left out of the output.
    """

    def __init__(self):
        # Fitted state, populated by fit(); everything is None until then.
        self.ohe = None               # OneHotEncoder, or None when there are no object columns
        self.scaler = None            # StandardScaler, or None when there are no numeric columns
        self.categorical_cols = None  # names of the object-dtype columns seen in fit
        self.numeric_cols = None      # names of the int64/float64 columns seen in fit

    def fit(self, X, y=None):
        """Detect the column groups in ``X`` and fit the encoder and the scaler.

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        Preprocesador
            ``self``.
        """
        # NOTE(review): only 'object', 'int64' and 'float64' are picked up; columns with
        # other dtypes (int32, bool, category, string...) are silently left out - confirm.
        self.categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
        self.numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

        # An empty group gets no estimator: fitting on a zero-column frame would fail.
        self.ohe = None
        if self.categorical_cols:
            self.ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
            self.ohe.fit(X[self.categorical_cols])

        self.scaler = None
        if self.numeric_cols:
            self.scaler = StandardScaler()
            self.scaler.fit(X[self.numeric_cols])

        return self

    def transform(self, X):
        """Return the scaled numeric columns followed by the one-hot encoded columns.

        ``X`` is only read (column subsets are new frames), so no defensive copy of
        the whole frame is made.
        """
        partes = []

        if self.scaler is not None:
            partes.append(pd.DataFrame(self.scaler.transform(X[self.numeric_cols]),
                                       columns=self.numeric_cols,
                                       index=X.index))

        if self.ohe is not None:
            partes.append(pd.DataFrame(self.ohe.transform(X[self.categorical_cols]),
                                       columns=self.ohe.get_feature_names_out(self.categorical_cols),
                                       index=X.index))

        if not partes:
            # Neither group exists: keep the rows, expose no columns.
            return pd.DataFrame(index=X.index)

        return pd.concat(partes, axis=1)


In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class ToNamedDataFrame(BaseEstimator, TransformerMixin):
    """Wrap an array-like in a DataFrame with columns ``<prefix>1`` .. ``<prefix>n_components``."""

    def __init__(self, prefix="PCA_", n_components=70):
        self.n_components = n_components
        self.prefix = prefix

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the step can sit inside a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame labelled with 1-based, prefixed column names."""
        etiquetas = []
        for posicion in range(1, self.n_components + 1):
            etiquetas.append(f"{self.prefix}{posicion}")
        return pd.DataFrame(X, columns=etiquetas)

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import pandas as pd

class PCAWithTarget(BaseEstimator, TransformerMixin):
    """PCA over every column except the target, which is carried through unchanged.

    Parameters
    ----------
    target_column : str
        Column excluded from the PCA input and appended to the output when present.
    n_components : int
        Forwarded to ``sklearn.decomposition.PCA``.
    """

    def __init__(self, target_column='Variable_objetivo', n_components=70):
        self.target_column = target_column
        self.n_components = n_components
        self.pca = PCA(n_components=self.n_components)

    def fit(self, X, y=None):
        """Fit the PCA on ``X`` without the target column. Returns ``self``."""
        # Re-sync the inner estimator: n_components may have been changed through
        # set_params() after __init__ built it.
        self.pca.set_params(n_components=self.n_components)
        X_features = X.drop(columns=[self.target_column], errors='ignore')
        self.pca.fit(X_features)
        return self

    def transform(self, X):
        """Project ``X`` and return a DataFrame ``PCA_1..PCA_k`` (+ target when present).

        The result has a fresh 0..n-1 index. The target column is appended only when
        ``X`` contains it, so frames without a label can be transformed as well.
        """
        X_features = X.drop(columns=[self.target_column], errors='ignore')

        X_pca = self.pca.transform(X_features)
        # Label by the actual output width instead of assuming it equals n_components.
        df_pca = pd.DataFrame(X_pca, columns=[f'PCA_{i+1}' for i in range(X_pca.shape[1])])

        if self.target_column in X.columns:
            # Positional copy: the PCA output no longer carries X's index.
            df_pca[self.target_column] = X[self.target_column].values

        return df_pca


In [10]:
# Variant without label pass-through: clean -> encode/scale -> PCA -> named DataFrame.
pipeline1 = Pipeline(steps=[
    ("Limpieza", limpieza_transformer),
    ("Preprocesamiento", Preprocesador()),
    ("pca", PCA(n_components=70)),
    ("ToName", ToNamedDataFrame()),
])

In [11]:
# Pipeline used in the cells below: clean -> encode/scale -> PCA that keeps the label column.
pipeline = Pipeline(steps=[
    ("Limpieza", limpieza_transformer),
    ("Preprocesamiento", Preprocesador()),
    ("PCAConY", PCAWithTarget(n_components=70, target_column="Variable_objetivo")),
])

In [12]:
# The label travels inside the frame because the cleaning step drops rows and the
# last step re-attaches 'Variable_objetivo' to the PCA output.
df_train = pd.concat([X_train, y_train], axis=1)

# fit_transform fits every step and returns the transformed training frame in a
# single pass; calling fit() and then transform() ran the cleaning step twice.
X_train_transformed = pipeline.fit_transform(df_train)

# NOTE(review): the pickle stores the cleaning function and the custom classes by
# reference, so presumably they must be importable wherever the file is loaded - verify.
joblib.dump(pipeline, 'fitted_pipeline.pkl')

df_result = pd.DataFrame(X_train_transformed)
print(df_result.head())

  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime

      PCA_1     PCA_2     PCA_3     PCA_4     PCA_5     PCA_6     PCA_7  \
0 -0.270609  5.160967 -0.050946 -1.611613 -0.580238 -1.101856  0.033269   
1  2.801940  4.458315  0.117493  4.658120  8.033373 -4.619878  0.119775   
2 -2.071198 -0.127946 -0.045887 -1.712919  0.323371 -0.836732  0.091710   
3 -2.430511 -2.152678 -0.032899  1.171642 -1.122775  0.484734 -0.132185   
4 -3.166253 -5.403357 -0.047708 -1.567654 -0.508092 -0.770168  0.218788   

      PCA_8     PCA_9    PCA_10  ...    PCA_62    PCA_63    PCA_64    PCA_65  \
0 -0.001704 -0.176539 -1.618223  ...  0.020030 -0.050898  0.011333  0.032061   
1  6.572873 -1.382043 -1.789236  ...  0.013140  0.033038 -0.011097 -0.046040   
2  1.437847  0.898360 -0.570980  ...  0.014079  0.025668 -0.006790 -0.013520   
3 -0.123213 -0.826893 -0.061160  ... -0.166112  0.043259 -0.005386 -0.073139   
4  0.360301  1.353395 -0.416295  ... -0.033805  0.070221  0.003638  0.059123   

     PCA_66    PCA_67    PCA_68    PCA_69    PCA_70  Variable_objeti

In [13]:
# Re-attach the label to the held-out predictors and push them through the fitted pipeline.
df_test = pd.concat(objs=[X_test, y_test], axis="columns")
X_test_transformed = pipeline.transform(df_test)

  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime(df_copy[column], errors='coerce')
  df_copy[column] = pd.to_datetime

In [15]:
import joblib
# SECURITY: joblib.load unpickles the file and can run arbitrary code while doing so;
# only load model files that come from a trusted source.
model_imported = joblib.load('random_forest_model.pkl')
# NOTE(review): X_test_transformed is the output of pipeline.transform(df_test), whose
# last step appends the 'Variable_objetivo' column, so the label itself reaches the
# model as an input feature here - confirm this is intended.
prediction = model_imported.predict(X_test_transformed)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [16]:
# Score only the first transformed test row; predict() expects a 2-D input, hence
# the reshape to a single row. Note that this overwrites `prediction`.
df_xtes = pd.DataFrame(X_test_transformed)
prediction = model_imported.predict(df_xtes.iloc[0].to_numpy().reshape(1, -1))
prediction



array([2], dtype=int64)

In [17]:
# Preview the first rows of the transformed test frame (PCA components plus the label column).
df_xtes.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,...,PCA_62,PCA_63,PCA_64,PCA_65,PCA_66,PCA_67,PCA_68,PCA_69,PCA_70,Variable_objetivo
0,-2.297062,-1.729995,0.045939,2.039859,0.536415,2.92361,-0.197543,0.775761,-1.202269,0.64578,...,0.041651,-0.098918,-0.014702,-0.036471,0.001011,-0.111173,0.008986,0.004185,0.065576,-0.686946
1,1.706867,4.780855,-0.047429,-0.584346,-1.148214,1.145915,0.043938,0.485856,1.496148,0.180953,...,-0.090484,-0.107445,0.01466,-0.033938,-0.132079,-0.009257,-0.01344,-0.002312,-0.119638,-0.686946
2,8.48461,-1.857176,-0.15457,-5.138718,-1.930209,-1.800175,0.49678,-0.471903,0.07099,0.770315,...,-0.00719,0.004418,-0.040653,-0.02298,0.057518,-0.037666,-0.026373,0.0028,-0.011358,1.455719
3,3.437531,1.443703,0.025789,1.880673,-0.085045,2.970523,-0.127059,-0.432163,1.394117,0.42961,...,0.26478,-0.301058,0.009189,-0.010021,-0.207674,-0.183845,0.010838,-0.011213,0.031516,-0.686946
4,6.765034,3.681444,-0.117264,-0.210611,-2.100701,0.071188,0.148761,-0.784888,-0.051146,-0.491475,...,-0.004046,0.016848,-0.002772,-0.014591,0.041828,-0.021092,-0.009631,0.006833,-0.248397,-0.686946
