In [139]:
import warnings
# NOTE(review): with no category or module given, this filter hides every warning
# raised by code executed after this line (deprecations, pandas copy warnings, ...).
warnings.filterwarnings("ignore")

# imports best practice pandas
import os

import numpy as np
import pandas as pd
import missingno as msno
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="white", palette=None)
%matplotlib inline
import math
import pickle
import joblib
import dill
import gzip
import inspect

#--------------------------------------------------------

# imports best practice sklearn
import sklearn
from sklearn.feature_selection import VarianceThreshold
from sklearn import set_config

# preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder, RobustScaler, MinMaxScaler
from scipy import stats
from imblearn.over_sampling import RandomOverSampler

# transformers
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import FunctionTransformer

# evaluacion
from sklearn.metrics import mean_absolute_error, r2_score, roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
set_config(transform_output = "pandas")

# models
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV

# interpretabilidad
import shap

In [140]:
# Source file: a gzip-compressed pickle stored under the processed-data folder.
DATA_PATH = "../../data/processed/"
FILE_NAME = "df_predicciones_propension_con_ROI.pkl.gz"
PICKLE_PATH = f"{DATA_PATH}{FILE_NAME}"

# Open the compressed stream with gzip and let pandas unpickle the DataFrame from it.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
with gzip.open(PICKLE_PATH, mode='rb') as compressed_stream:
    df_marketing = pd.read_pickle(compressed_stream)

print('Dataframe cargado exitosamente')


Dataframe cargado exitosamente


In [141]:
# Preview the first rows of the freshly loaded frame.
df_marketing.head()

Unnamed: 0,pk_cid,increased_nr_accounts_10_m+1,increased_nr_accounts_10_m+1_proba,ROI_nr_accounts_10_m+1,increased_nr_invest_savings_40_m+1,increased_nr_invest_savings_40_m+1_proba,ROI_nr_invest_savings_40_m+1,increased_nr_financing_60_m+1,increased_nr_financing_60_m+1_proba,ROI_nr_financing_60_m+1
0,16063,0,0.045,0.0,0,0.0,0.0,0,0.002,0.0
1,16203,1,0.952,9.515,0,0.008,0.0,0,0.143,0.0
2,16502,0,0.073,0.0,0,0.072,0.0,0,0.3,0.0
3,17457,1,0.885,8.853,1,0.524,20.957,0,0.381,0.0
4,17590,0,0.401,0.0,0,0.028,0.0,0,0.092,0.0


In [142]:
# Count how often each complete row (all columns combined) occurs in the frame.
# NOTE(review): every row visible in the recorded output has a count of 1; if the aim is a
# duplicate check, `df_marketing.duplicated().sum()` would state it directly — confirm intent.
df_marketing.value_counts()

pk_cid   increased_nr_accounts_10_m+1  increased_nr_accounts_10_m+1_proba  ROI_nr_accounts_10_m+1  increased_nr_invest_savings_40_m+1  increased_nr_invest_savings_40_m+1_proba  ROI_nr_invest_savings_40_m+1  increased_nr_financing_60_m+1  increased_nr_financing_60_m+1_proba  ROI_nr_financing_60_m+1
16063    0                             0.045                               0.000                   0                                   0.000                                     0.000                         0                              0.002                                0.000                      1
1368056  0                             0.170                               0.000                   0                                   0.000                                     0.000                         0                              0.003                                0.000                      1
1368067  0                             0.134                               0.000             

In [143]:
# Frequency table of each `increased_*` flag (NaN would be listed as its own value).
# The three tables are printed with two blank lines between them.
flag_count_tables = [
    df_marketing[flag_column].value_counts(dropna=False)
    for flag_column in (
        'increased_nr_accounts_10_m+1',
        'increased_nr_invest_savings_40_m+1',
        'increased_nr_financing_60_m+1',
    )
]
print(*flag_count_tables, sep='\n\n\n')

increased_nr_accounts_10_m+1
0    398965
1     44030
Name: count, dtype: int64


increased_nr_invest_savings_40_m+1
0    421857
1     21138
Name: count, dtype: int64


increased_nr_financing_60_m+1
0    407670
1     35325
Name: count, dtype: int64


In [144]:
# Average probability column within each value of its matching `increased_*` flag.
# The three tables are printed with two blank lines between them.
flag_and_proba_columns = (
    ('increased_nr_accounts_10_m+1', 'increased_nr_accounts_10_m+1_proba'),
    ('increased_nr_invest_savings_40_m+1', 'increased_nr_invest_savings_40_m+1_proba'),
    ('increased_nr_financing_60_m+1', 'increased_nr_financing_60_m+1_proba'),
)
mean_proba_tables = [
    df_marketing.groupby([flag_column])[proba_column].mean()
    for flag_column, proba_column in flag_and_proba_columns
]
print(*mean_proba_tables, sep='\n\n\n')

increased_nr_accounts_10_m+1
0   0.101
1   0.707
Name: increased_nr_accounts_10_m+1_proba, dtype: float32


increased_nr_invest_savings_40_m+1
0   0.034
1   0.680
Name: increased_nr_invest_savings_40_m+1_proba, dtype: float32


increased_nr_financing_60_m+1
0   0.038
1   0.755
Name: increased_nr_financing_60_m+1_proba, dtype: float32


In [145]:
# Show floats with three fixed decimals so large sums are not rendered in scientific notation.
# NOTE(review): this is a global pandas display option; outputs recorded for earlier cells
# already show three-decimal floats, so it probably belongs in the setup cell — confirm.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Total of each ROI column within each value of its matching `increased_*` flag.
# The three tables are printed with two blank lines between them.
flag_and_roi_columns = (
    ('increased_nr_accounts_10_m+1', 'ROI_nr_accounts_10_m+1'),
    ('increased_nr_invest_savings_40_m+1', 'ROI_nr_invest_savings_40_m+1'),
    ('increased_nr_financing_60_m+1', 'ROI_nr_financing_60_m+1'),
)
roi_sum_tables = [
    df_marketing.groupby([flag_column])[roi_column].sum()
    for flag_column, roi_column in flag_and_roi_columns
]
print(*roi_sum_tables, sep='\n\n\n')

increased_nr_accounts_10_m+1
0        0.000
1   311374.667
Name: ROI_nr_accounts_10_m+1, dtype: float64


increased_nr_invest_savings_40_m+1
0        0.000
1   574819.029
Name: ROI_nr_invest_savings_40_m+1, dtype: float64


increased_nr_financing_60_m+1
0         0.000
1   1600209.663
Name: ROI_nr_financing_60_m+1, dtype: float64


In [146]:
# Create the "margen_10", "margen_40" and "margen_60" columns in df_marketing.
# Each target column holds a fixed margin when its `increased_*` flag equals 1 and 0 otherwise.
# NOTE(review): 10 / 40 / 60 are presumably the unit margins per product family —
# confirm the values and their currency with the business definition.
margen_por_flag = {
    'margen_10': ('increased_nr_accounts_10_m+1', 10),
    'margen_40': ('increased_nr_invest_savings_40_m+1', 40),
    'margen_60': ('increased_nr_financing_60_m+1', 60),
}

# Vectorised comparison instead of a Python lambda called once per row.
# The explicit int64 cast keeps the dtype that the row-wise apply produced.
for columna_margen, (columna_flag, margen) in margen_por_flag.items():
    df_marketing[columna_margen] = df_marketing[columna_flag].eq(1).astype('int64') * margen

# Print the first rows to check the new columns.
print(df_marketing.head())


   pk_cid  increased_nr_accounts_10_m+1  increased_nr_accounts_10_m+1_proba  \
0   16063                             0                               0.045   
1   16203                             1                               0.952   
2   16502                             0                               0.073   
3   17457                             1                               0.885   
4   17590                             0                               0.401   

   ROI_nr_accounts_10_m+1  increased_nr_invest_savings_40_m+1  \
0                   0.000                                   0   
1                   9.515                                   0   
2                   0.000                                   0   
3                   8.853                                   1   
4                   0.000                                   0   

   increased_nr_invest_savings_40_m+1_proba  ROI_nr_invest_savings_40_m+1  \
0                                     0.000              

In [147]:
# Random preview of 20 rows; the fixed seed keeps the displayed rows identical
# across "Restart & Run All" executions.
df_marketing.sample(20, random_state=1)

Unnamed: 0,pk_cid,increased_nr_accounts_10_m+1,increased_nr_accounts_10_m+1_proba,ROI_nr_accounts_10_m+1,increased_nr_invest_savings_40_m+1,increased_nr_invest_savings_40_m+1_proba,ROI_nr_invest_savings_40_m+1,increased_nr_financing_60_m+1,increased_nr_financing_60_m+1_proba,ROI_nr_financing_60_m+1,margen_10,margen_40,margen_60
431275,1537612,0,0.223,0.0,0,0.0,0.0,0,0.006,0.0,0,0,0
177364,1219042,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0
113530,1144823,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0
350950,1442546,0,0.028,0.0,0,0.006,0.0,0,0.011,0.0,0,0,0
153539,1192275,0,0.446,0.0,0,0.044,0.0,0,0.014,0.0,0,0,0
217224,1274836,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0
221837,1280574,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0
111896,1142931,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0
288986,1360345,0,0.001,0.0,0,0.0,0.0,0,0.016,0.0,0,0,0
293463,1365693,0,0.412,0.0,1,0.566,22.632,0,0.234,0.0,0,40,0


In [148]:
# Create "margen_esperado_10/40/60": each margin column multiplied by its probability column.
# NOTE(review): in the recorded previews these values coincide with the ROI_* columns —
# confirm whether the new columns are redundant.
columnas_margen_esperado = (
    ('margen_esperado_10', 'margen_10', 'increased_nr_accounts_10_m+1_proba'),
    ('margen_esperado_40', 'margen_40', 'increased_nr_invest_savings_40_m+1_proba'),
    ('margen_esperado_60', 'margen_60', 'increased_nr_financing_60_m+1_proba'),
)
for columna_destino, columna_margen, columna_proba in columnas_margen_esperado:
    df_marketing[columna_destino] = df_marketing[columna_margen].mul(df_marketing[columna_proba])

# Print the first rows to check the new columns.
print(df_marketing.head())


   pk_cid  increased_nr_accounts_10_m+1  increased_nr_accounts_10_m+1_proba  \
0   16063                             0                               0.045   
1   16203                             1                               0.952   
2   16502                             0                               0.073   
3   17457                             1                               0.885   
4   17590                             0                               0.401   

   ROI_nr_accounts_10_m+1  increased_nr_invest_savings_40_m+1  \
0                   0.000                                   0   
1                   9.515                                   0   
2                   0.000                                   0   
3                   8.853                                   1   
4                   0.000                                   0   

   increased_nr_invest_savings_40_m+1_proba  ROI_nr_invest_savings_40_m+1  \
0                                     0.000              

In [149]:
# Rich-display preview of the first rows, now including the margin columns added above.
df_marketing.head()

Unnamed: 0,pk_cid,increased_nr_accounts_10_m+1,increased_nr_accounts_10_m+1_proba,ROI_nr_accounts_10_m+1,increased_nr_invest_savings_40_m+1,increased_nr_invest_savings_40_m+1_proba,ROI_nr_invest_savings_40_m+1,increased_nr_financing_60_m+1,increased_nr_financing_60_m+1_proba,ROI_nr_financing_60_m+1,margen_10,margen_40,margen_60,margen_esperado_10,margen_esperado_40,margen_esperado_60
0,16063,0,0.045,0.0,0,0.0,0.0,0,0.002,0.0,0,0,0,0.0,0.0,0.0
1,16203,1,0.952,9.515,0,0.008,0.0,0,0.143,0.0,10,0,0,9.515,0.0,0.0
2,16502,0,0.073,0.0,0,0.072,0.0,0,0.3,0.0,0,0,0,0.0,0.0,0.0
3,17457,1,0.885,8.853,1,0.524,20.957,0,0.381,0.0,10,40,0,8.853,20.957,0.0
4,17590,0,0.401,0.0,0,0.028,0.0,0,0.092,0.0,0,0,0,0.0,0.0,0.0


In [150]:
# "margen_esperado_max" = row-wise maximum of the three expected-margin columns.
columnas_a_comparar = ['margen_esperado_10', 'margen_esperado_40', 'margen_esperado_60']
df_marketing['margen_esperado_max'] = df_marketing[columnas_a_comparar].max(axis='columns')

# Print the first rows to check the new column.
print(df_marketing.head())


   pk_cid  increased_nr_accounts_10_m+1  increased_nr_accounts_10_m+1_proba  \
0   16063                             0                               0.045   
1   16203                             1                               0.952   
2   16502                             0                               0.073   
3   17457                             1                               0.885   
4   17590                             0                               0.401   

   ROI_nr_accounts_10_m+1  increased_nr_invest_savings_40_m+1  \
0                   0.000                                   0   
1                   9.515                                   0   
2                   0.000                                   0   
3                   8.853                                   1   
4                   0.000                                   0   

   increased_nr_invest_savings_40_m+1_proba  ROI_nr_invest_savings_40_m+1  \
0                                     0.000              

In [151]:
# Random preview of 10 rows; the fixed seed keeps the displayed rows identical
# across "Restart & Run All" executions.
df_marketing.sample(10, random_state=1)

Unnamed: 0,pk_cid,increased_nr_accounts_10_m+1,increased_nr_accounts_10_m+1_proba,ROI_nr_accounts_10_m+1,increased_nr_invest_savings_40_m+1,increased_nr_invest_savings_40_m+1_proba,ROI_nr_invest_savings_40_m+1,increased_nr_financing_60_m+1,increased_nr_financing_60_m+1_proba,ROI_nr_financing_60_m+1,margen_10,margen_40,margen_60,margen_esperado_10,margen_esperado_40,margen_esperado_60,margen_esperado_max
356415,1448961,0,0.296,0.0,0,0.001,0.0,0,0.002,0.0,0,0,0,0.0,0.0,0.0,0.0
360248,1453267,0,0.018,0.0,0,0.008,0.0,0,0.075,0.0,0,0,0,0.0,0.0,0.0,0.0
26062,1031262,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0
179813,1221887,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0
171503,1212202,0,0.201,0.0,0,0.02,0.0,0,0.003,0.0,0,0,0,0.0,0.0,0.0,0.0
356129,1448637,0,0.491,0.0,0,0.12,0.0,0,0.391,0.0,0,0,0,0.0,0.0,0.0,0.0
306322,1383015,0,0.406,0.0,0,0.208,0.0,0,0.071,0.0,0,0,0,0.0,0.0,0.0,0.0
159325,1198614,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0
75818,1090970,0,0.337,0.0,0,0.006,0.0,0,0.008,0.0,0,0,0,0.0,0.0,0.0,0.0
78827,1095177,0,0.001,0.0,0,0.0,0.0,0,0.004,0.0,0,0,0,0.0,0.0,0.0,0.0


In [152]:
# Rank all customers by 'margen_esperado_max', largest value first.
df_ordenado = df_marketing.sort_values(by='margen_esperado_max', ascending=False)

# `dataset_emails` keeps the 10,000 top-ranked rows of that ordering.
dataset_emails = df_ordenado.head(n=10000)

# Print the first rows of `dataset_emails` as a check.
print(dataset_emails.head())

         pk_cid  increased_nr_accounts_10_m+1  \
361165  1454294                             0   
313863  1394148                             0   
48903   1060165                             0   
345395  1435125                             0   
311097  1390071                             1   

        increased_nr_accounts_10_m+1_proba  ROI_nr_accounts_10_m+1  \
361165                               0.002                   0.000   
313863                               0.185                   0.000   
48903                                0.421                   0.000   
345395                               0.358                   0.000   
311097                               0.645                   6.449   

        increased_nr_invest_savings_40_m+1  \
361165                                   0   
313863                                   0   
48903                                    0   
345395                                   0   
311097                                   1   

       

In [153]:
# Column names, dtypes and non-null counts of the 10,000-row selection.
dataset_emails.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 361165 to 145755
Data columns (total 17 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   pk_cid                                    10000 non-null  int64  
 1   increased_nr_accounts_10_m+1              10000 non-null  int32  
 2   increased_nr_accounts_10_m+1_proba        10000 non-null  float32
 3   ROI_nr_accounts_10_m+1                    10000 non-null  float64
 4   increased_nr_invest_savings_40_m+1        10000 non-null  int32  
 5   increased_nr_invest_savings_40_m+1_proba  10000 non-null  float32
 6   ROI_nr_invest_savings_40_m+1              10000 non-null  float64
 7   increased_nr_financing_60_m+1             10000 non-null  int32  
 8   increased_nr_financing_60_m+1_proba       10000 non-null  float32
 9   ROI_nr_financing_60_m+1                   10000 non-null  float64
 10  margen_10                        

In [154]:
# Number of distinct customer ids; it equals the row count when no customer is repeated.
dataset_emails["pk_cid"].nunique()

10000

In [None]:
# Destination of the 10,000-row selection, written as a gzip-compressed pickle.
# NOTE(review): this rebinds DATA_PATH / FILE_NAME / PICKLE_PATH from the loading cell, and
# the export shows no execution count for this cell — confirm the file name is the intended target.
DATA_PATH = "../../data/processed/"
FILE_NAME = "df_marketing_10k_MERGED_ALL_LAST_PARTITION.pkl.gz"
PICKLE_PATH = f"{DATA_PATH}{FILE_NAME}"

dataset_emails.to_pickle(PICKLE_PATH, compression='gzip')

In [155]:
# Draw a random sample of 1,000 customers from `dataset_emails`.
# (The original note referred to 'dataset_campaña_emails'; no variable with that name
# appears in the visible cells.)
muestra_1000 = dataset_emails.sample(n=1000, random_state=1)  # fixed random_state for reproducibility

In [156]:
# Pick the product family from the expected-margin values of one row.
def determinar_segmento_producto(fila):
    """Return the product family whose expected margin strictly beats the other two.

    Parameters
    ----------
    fila : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'margen_esperado_10', 'margen_esperado_40'
        and 'margen_esperado_60'.

    Returns
    -------
    str
        "accounts", "savings and investment" or "financing" for a strict winner;
        "unknown" when no column is strictly greater than both others
        (ties, or comparisons that evaluate to False such as those involving NaN).
    """
    familias = (
        ('margen_esperado_10', "accounts"),
        ('margen_esperado_40', "savings and investment"),
        ('margen_esperado_60', "financing"),
    )
    for columna, familia in familias:
        rivales = [otra for otra, _ in familias if otra != columna]
        # A family wins only if it is strictly above every rival column.
        if all(fila[columna] > fila[rival] for rival in rivales):
            return familia
    return "unknown"

# Build 'segment_product' by calling determinar_segmento_producto on every row of the sample.
muestra_1000['segment_product'] = muestra_1000.apply(determinar_segmento_producto, axis='columns')

# Print the first rows to check the new column.
print(muestra_1000.head())

         pk_cid  increased_nr_accounts_10_m+1  \
193583  1239143                             0   
238553  1301822                             0   
168241  1208539                             0   
362780  1456165                             0   
98552   1119240                             0   

        increased_nr_accounts_10_m+1_proba  ROI_nr_accounts_10_m+1  \
193583                               0.419                   0.000   
238553                               0.133                   0.000   
168241                               0.214                   0.000   
362780                               0.020                   0.000   
98552                                0.119                   0.000   

        increased_nr_invest_savings_40_m+1  \
193583                                   1   
238553                                   0   
168241                                   0   
362780                                   0   
98552                                    0   

       

In [157]:
# Rich-display preview of the sample, including the new 'segment_product' column.
muestra_1000.head()

Unnamed: 0,pk_cid,increased_nr_accounts_10_m+1,increased_nr_accounts_10_m+1_proba,ROI_nr_accounts_10_m+1,increased_nr_invest_savings_40_m+1,increased_nr_invest_savings_40_m+1_proba,ROI_nr_invest_savings_40_m+1,increased_nr_financing_60_m+1,increased_nr_financing_60_m+1_proba,ROI_nr_financing_60_m+1,margen_10,margen_40,margen_60,margen_esperado_10,margen_esperado_40,margen_esperado_60,margen_esperado_max,segment_product
193583,1239143,0,0.419,0.0,1,0.529,21.161,1,0.854,51.251,0,40,60,0.0,21.161,51.251,51.251,financing
238553,1301822,0,0.133,0.0,0,0.079,0.0,1,0.937,56.198,0,0,60,0.0,0.0,56.198,56.198,financing
168241,1208539,0,0.214,0.0,0,0.235,0.0,1,0.919,55.141,0,0,60,0.0,0.0,55.141,55.141,financing
362780,1456165,0,0.02,0.0,0,0.014,0.0,1,0.936,56.161,0,0,60,0.0,0.0,56.161,56.161,financing
98552,1119240,0,0.119,0.0,0,0.367,0.0,1,0.912,54.697,0,0,60,0.0,0.0,54.697,54.697,financing


In [158]:
# Distribution of the assigned product family across the sample.
# NOTE(review): the recorded output assigns all 1,000 rows to "financing"; presumably ranking
# by absolute expected margin favours the highest-margin family — confirm this is intended.
muestra_1000["segment_product"].value_counts()

segment_product
financing    1000
Name: count, dtype: int64

In [159]:
# Keep only the customer id and its assigned segment.
# Selecting the two surviving columns replaces sixteen separate in-place drops. Unlike
# those, it can be re-run without a KeyError for columns that were already removed.
muestra_1000 = muestra_1000[['pk_cid', 'segment_product']].copy()

In [160]:
# Preview of the trimmed sample after the column removal.
muestra_1000.head()

Unnamed: 0,pk_cid,segment_product
193583,1239143,financing
238553,1301822,financing
168241,1208539,financing
362780,1456165,financing
98552,1119240,financing


In [62]:
# Number of distinct customer ids in the sample; it equals the row count when no id is repeated.
muestra_1000["pk_cid"].nunique()

1000

In [163]:
# Save the `muestra_1000` DataFrame as a CSV file, leaving the index out of the file.
# NOTE(review): the path is relative to the working directory, unlike the pickles stored
# under DATA_PATH — confirm the intended location.
muestra_1000.to_csv('muestra_1000_ejemplo.csv', index=False)