In [1]:
import warnings
warnings.filterwarnings("ignore")

# imports best practice pandas
import os

import numpy as np
import pandas as pd
import missingno as msno
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="white", palette=None)
%matplotlib inline
import math
import pickle
import joblib
import dill
import gzip
import inspect

#--------------------------------------------------------

# imports best practice sklearn
import sklearn
from sklearn.feature_selection import VarianceThreshold
from sklearn import set_config

# preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder, RobustScaler, MinMaxScaler
from scipy import stats
from imblearn.over_sampling import RandomOverSampler

# transformers
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import FunctionTransformer

# evaluacion
from sklearn.metrics import mean_absolute_error, r2_score, roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
set_config(transform_output = "pandas")

# models
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV

# interpretabilidad
import shap

### Importamos el dataset y revisamos las métricas principales

In [2]:
# Load the feature-engineered dataframe from its gzip-compressed pickle.
# NOTE: unpickling executes code embedded in the file; only load trusted files.
DATA_PATH = "../../data/processed/"
FILE_NAME = "df_capstone_newFeatures.pkl.gz"
PICKLE_PATH = f"{DATA_PATH}{FILE_NAME}"

with gzip.open(PICKLE_PATH, mode='rb') as compressed_pickle:
    df_capstone_clean = pd.read_pickle(compressed_pickle)

print('Dataframe cargado exitosamente')

Dataframe cargado exitosamente


In [3]:
# Load the dataframe that already carries the cluster assignment
# (gzip-compressed pickle in the processed-data folder).
# NOTE: unpickling executes code embedded in the file; only load trusted files.
DATA_PATH = "../../data/processed/"
FILE_NAME = "df_cluestering_WITH_CLUSTERS.pkl.gz"
PICKLE_PATH = f"{DATA_PATH}{FILE_NAME}"

with gzip.open(PICKLE_PATH, mode='rb') as compressed_pickle:
    df_capstone_clean_with_clusters = pd.read_pickle(compressed_pickle)

print('Dataframe cragado exitosamente')

Dataframe cragado exitosamente


In [4]:
# Bring in, from the original full-feature frame, the columns the classifiers
# need. Rows are matched on customer id + partition date; the left join keeps
# every row of the clustered frame.
merge_keys = ['pk_cid', 'pk_partition']
extra_columns = [
    'active_customer', 'entry_date', 'regions_ca_id',
    'nr_account_10_trend', 'nr_invest_savings_40_trend', 'nr_financing_60_trend',
    'salary', 'product_mix_diversity',
]

df_capstone_clean_with_clusters = df_capstone_clean_with_clusters.merge(
    df_capstone_clean[merge_keys + extra_columns],
    how='left',
    left_on=merge_keys,
    right_on=merge_keys,
)

In [5]:
# Create the columns that are still missing.

# Binary flags: 1 when the corresponding product-family trend is strictly positive.
df_capstone_clean_with_clusters['increased_nr_account_10'] = (df_capstone_clean_with_clusters['nr_account_10_trend'] > 0).astype('int')
df_capstone_clean_with_clusters['increased_nr_invest_savings_40'] = (df_capstone_clean_with_clusters['nr_invest_savings_40_trend'] > 0).astype('int')
df_capstone_clean_with_clusters['increased_nr_financing_60'] = (df_capstone_clean_with_clusters['nr_financing_60_trend'] > 0).astype('int')

# "_m+1" targets: for each customer, the flag value of that customer's next row
# in partition order (NaN on the customer's last row).
# The frame is sorted once and GroupBy.shift is used instead of
# transform(lambda x: x.shift(-1)), which calls a Python function per customer.
# The result keeps the original row index, so the assignments below align by index.
# NOTE(review): shift(-1) takes the next *observed* partition of the customer; if a
# customer can skip months this is not strictly "month + 1" - confirm.
flag_columns = ['increased_nr_account_10', 'increased_nr_invest_savings_40', 'increased_nr_financing_60']
next_partition_flags = (
    df_capstone_clean_with_clusters[['pk_cid', 'pk_partition'] + flag_columns]
    .sort_values(['pk_partition', 'pk_cid'])
    .groupby('pk_cid')[flag_columns]
    .shift(-1)
)

df_capstone_clean_with_clusters['increased_nr_accounts_10_m+1'] = next_partition_flags['increased_nr_account_10']
df_capstone_clean_with_clusters['increased_nr_invest_savings_40_m+1'] = next_partition_flags['increased_nr_invest_savings_40']
df_capstone_clean_with_clusters['increased_nr_financing_60_m+1'] = next_partition_flags['increased_nr_financing_60']

# Move the three target columns to positions 3-5.
df_capstone_clean_with_clusters.insert(3, 'increased_nr_accounts_10_m+1', df_capstone_clean_with_clusters.pop('increased_nr_accounts_10_m+1'))
df_capstone_clean_with_clusters.insert(4, 'increased_nr_invest_savings_40_m+1', df_capstone_clean_with_clusters.pop('increased_nr_invest_savings_40_m+1'))
df_capstone_clean_with_clusters.insert(5, 'increased_nr_financing_60_m+1', df_capstone_clean_with_clusters.pop('increased_nr_financing_60_m+1'))

In [6]:
# Drop the reference to the full-feature frame; the columns needed from it were merged above.
del df_capstone_clean

In [7]:
# List the distinct partition dates present in the merged frame.
df_capstone_clean_with_clusters['pk_partition'].unique()

<DatetimeArray>
['2018-07-28 00:00:00', '2018-08-28 00:00:00', '2018-11-28 00:00:00',
 '2018-12-28 00:00:00', '2019-01-28 00:00:00', '2019-02-28 00:00:00',
 '2019-03-28 00:00:00', '2019-04-28 00:00:00', '2019-05-28 00:00:00',
 '2018-09-28 00:00:00', '2018-10-28 00:00:00', '2018-01-28 00:00:00',
 '2018-02-28 00:00:00', '2018-03-28 00:00:00', '2018-04-28 00:00:00',
 '2018-05-28 00:00:00', '2018-06-28 00:00:00']
Length: 17, dtype: datetime64[ns]

In [8]:
# Keep only the rows of the 2019-05-28 partition, the latest date listed above;
# these are the customers the models will score.
df_capstone_clean_with_clusters_last_partition = df_capstone_clean_with_clusters.loc[
    df_capstone_clean_with_clusters['pk_partition'].eq('2019-05-28')
]

In [9]:
def report_df(df, verbose = True):
    '''
    Print a simple report about the supplied DataFrame.

    The report is the output of ``DataFrame.info`` (with non-null counts),
    a blank line, and the total number of null cells in the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    verbose : bool, default True
        Forwarded to ``DataFrame.info``.

    Returns
    -------
    None
    '''
    # DataFrame.info() writes to stdout itself and returns None, so it must not be
    # wrapped in print(): that would emit a stray "None" line after the summary.
    df.info(verbose = verbose, show_counts=True)
    # Sum of per-column null counts; int() keeps the message free of a
    # floating-point rendering.
    total_nulos = int(df.isnull().sum().sum())
    print()
    print(f"Tenemos un total de {total_nulos} nulos")

In [10]:
# Print the column / null summary of the merged frame, then preview its first
# rows transposed (one line per column).
report_df(df_capstone_clean_with_clusters)

df_capstone_clean_with_clusters.head().T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5962924 entries, 0 to 5962923
Data columns (total 35 columns):
 #   Column                              Non-Null Count    Dtype         
---  ------                              --------------    -----         
 0   pk_cid                              5962924 non-null  int64         
 1   pk_partition                        5962924 non-null  datetime64[ns]
 2   segment                             5962924 non-null  object        
 3   increased_nr_accounts_10_m+1        5506551 non-null  float64       
 4   increased_nr_invest_savings_40_m+1  5506551 non-null  float64       
 5   increased_nr_financing_60_m+1       5506551 non-null  float64       
 6   age                                 5962924 non-null  int64         
 7   financial_health_score              5962924 non-null  float64       
 8   total_profit_customer               5962924 non-null  float64       
 9   months_from_last_purchase           5962924 non-null  int64         

Unnamed: 0,0,1,2,3,4
pk_cid,15891,15891,16063,16063,16063
pk_partition,2018-07-28 00:00:00,2018-08-28 00:00:00,2018-11-28 00:00:00,2018-12-28 00:00:00,2019-01-28 00:00:00
segment,02 - PARTICULARES,02 - PARTICULARES,02 - PARTICULARES,02 - PARTICULARES,02 - PARTICULARES
increased_nr_accounts_10_m+1,0.0,,0.0,0.0,0.0
increased_nr_invest_savings_40_m+1,0.0,,0.0,0.0,0.0
increased_nr_financing_60_m+1,0.0,,0.0,0.0,0.0
age,59,59,62,62,62
financial_health_score,0.001671,0.001671,0.001765,0.001765,0.001765
total_profit_customer,0.0,0.0,0.0,0.0,0.0
months_from_last_purchase,0,1,0,1,2


### Cargamos las funciones e importamos el pipeline

In [11]:
from functions.functions_for_classifier import pk_partition_month_extraction, convert_entry_date_to_numeric

In [12]:
# Print the source code of the imported helper so its behavior is visible in the notebook.
source = inspect.getsource(pk_partition_month_extraction)

print(source)

def pk_partition_month_extraction(df):
    
    # df['pk_partition_month'] = df['pk_partition'].dt.month_name()
    df.drop('pk_partition', axis='columns', inplace=True)
    
    return df



In [13]:
# Print the source code of the imported helper so its behavior is visible in the notebook.
source = inspect.getsource(convert_entry_date_to_numeric)

print(source)

def convert_entry_date_to_numeric(df):
    
    df['entry_date'] = df['entry_date'].apply(lambda x: x.timestamp()).astype('int')
    
    return df



In [14]:
# Locate the three pickled preprocessing pipelines (one per product family)
# inside the 'classifier_propension' folder of the current working directory.
CWD = os.getcwd()
DATA_PATH = os.path.join(CWD, 'classifier_propension')
PIPE_PATH_ACCOUNTS = os.path.join(DATA_PATH, 'pipeline_para_classifier_propension_con_cluster_accounts.pkl')
PIPE_PATH_SAVING_AND_INVESTMENT = os.path.join(DATA_PATH, 'pipeline_para_classifier_propension_con_cluster_saving_and_investment.pkl')
PIPE_PATH_FINANCING = os.path.join(DATA_PATH, 'pipeline_para_classifier_propension_con_cluster_financing.pkl')

# Each file is opened in a `with` block so its handle is closed right after
# loading (pickle.load(open(...)) leaves the handle open).
# NOTE(security): pickle.load runs code embedded in the file; only load trusted artifacts.
with open(PIPE_PATH_ACCOUNTS, 'rb') as pipeline_file:
    pipe_accounts = pickle.load(pipeline_file)

with open(PIPE_PATH_SAVING_AND_INVESTMENT, 'rb') as pipeline_file:
    pipe_saving_and_investment = pickle.load(pipeline_file)

with open(PIPE_PATH_FINANCING, 'rb') as pipeline_file:
    pipe_financing = pickle.load(pipeline_file)

In [15]:
# Display the loaded accounts pipeline.
pipe_accounts


In [16]:
# Display the loaded saving-and-investment pipeline.
pipe_saving_and_investment

In [17]:
# Display the loaded financing pipeline.
pipe_financing

### Cargamos el modelo

In [18]:
# Locate the three pickled classifiers (one per product family) inside the
# 'classifier_propension' folder of the current working directory.
CWD = os.getcwd()
DATA_PATH = os.path.join(CWD, 'classifier_propension')
MODEL_PATH_ACCOUNTS = os.path.join(DATA_PATH, 'classifier_propension_con_cluster_accounts.pkl')
MODEL_PATH_SAVING_AND_INVESTMENT = os.path.join(DATA_PATH, 'classifier_propension_con_cluster_saving_and_investment.pkl')
MODEL_PATH_FINANCING = os.path.join(DATA_PATH, 'classifier_propension_con_cluster_financing.pkl')

# Each file is opened in a `with` block so its handle is closed right after
# loading (pickle.load(open(...)) leaves the handle open).
# NOTE(security): pickle.load runs code embedded in the file; only load trusted artifacts.
with open(MODEL_PATH_ACCOUNTS, 'rb') as model_file:
    model_accounts = pickle.load(model_file)

with open(MODEL_PATH_SAVING_AND_INVESTMENT, 'rb') as model_file:
    model_saving_and_investment = pickle.load(model_file)

with open(MODEL_PATH_FINANCING, 'rb') as model_file:
    model_financing = pickle.load(model_file)

In [19]:
# Display the loaded accounts classifier.
model_accounts

In [20]:
# Display the loaded saving-and-investment classifier.
model_saving_and_investment

In [21]:
# Display the loaded financing classifier.
model_financing

### Hacemos el predict

#### Accounts

In [121]:
# Run the last-partition rows through the fitted accounts pipeline.
# A copy is handed over so the source frame is left untouched (the helper
# functions printed above modify their input in place).
df_capstone_clean_with_clusters_last_partition_account_predictions = pipe_accounts.transform(
    df_capstone_clean_with_clusters_last_partition.copy()
)

In [122]:
# Inspect columns, dtypes and non-null counts of the transformed accounts frame.
df_capstone_clean_with_clusters_last_partition_account_predictions.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 442995 entries, 8 to 5962923
Data columns (total 46 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   regions_ca_id_AN              442995 non-null  int8   
 1   regions_ca_id_AR              442995 non-null  int8   
 2   regions_ca_id_AS              442995 non-null  int8   
 3   regions_ca_id_CB              442995 non-null  int8   
 4   regions_ca_id_CE              442995 non-null  int8   
 5   regions_ca_id_CL              442995 non-null  int8   
 6   regions_ca_id_CM              442995 non-null  int8   
 7   regions_ca_id_CN              442995 non-null  int8   
 8   regions_ca_id_CT              442995 non-null  int8   
 9   regions_ca_id_EX              442995 non-null  int8   
 10  regions_ca_id_Extranjero      442995 non-null  int8   
 11  regions_ca_id_GA              442995 non-null  int8   
 12  regions_ca_id_IB              442995 non-null  i

In [123]:
# Move pk_cid (customer id) from the columns to the index.
df_capstone_clean_with_clusters_last_partition_account_predictions = (
    df_capstone_clean_with_clusters_last_partition_account_predictions.set_index('pk_cid')
)

In [124]:
# Work on a copy so the transformed frame itself is not modified by the column edits below.
X_pred_accounts = df_capstone_clean_with_clusters_last_partition_account_predictions.copy()

In [125]:
# Show the feature names stored on the accounts model (its feature_names_in_ attribute).
print(model_accounts.feature_names_in_)

['regions_ca_id_AN' 'regions_ca_id_AR' 'regions_ca_id_AS'
 'regions_ca_id_CB' 'regions_ca_id_CE' 'regions_ca_id_CL'
 'regions_ca_id_CM' 'regions_ca_id_CN' 'regions_ca_id_CT'
 'regions_ca_id_EX' 'regions_ca_id_Extranjero' 'regions_ca_id_GA'
 'regions_ca_id_IB' 'regions_ca_id_MC' 'regions_ca_id_MD'
 'regions_ca_id_ML' 'regions_ca_id_NC' 'regions_ca_id_PV'
 'regions_ca_id_RI' 'regions_ca_id_VC' 'cluster_-1.0' 'cluster_0.0'
 'cluster_1.0' 'cluster_2.0' 'cluster_3.0' 'cluster_4.0' 'cluster_5.0'
 'cluster_6.0' 'segment' 'age' 'months_from_last_purchase'
 'nr_financing_60' 'nr_account_10' 'nr_debt' 'nr_product_trend'
 'customer_tenure' 'nr_transactions' 'saving_behavior'
 'risk_index_absolute' 'active_customer_rate' 'entry_date'
 'nr_account_10_trend' 'salary' 'increased_nr_account_10']


In [126]:
# Compare the dataset columns against the features stored on the model:
# the two lists must be equal (same names, same order) before predicting.
features_pred = X_pred_accounts.columns.tolist()
features_model = list(model_accounts.feature_names_in_)

print(f'Columnas en dataset: {len(features_pred)}')
print(f'Variables en modelos: {len(features_model)}')
print(f'¿Match?: {features_model == features_pred}')

Columnas en dataset: 45
Variables en modelos: 44
¿Match?: False


In [127]:
# Features the model has but the dataset lacks, and dataset columns the model
# does not have; both keep the order of the list they come from.
missing_features = [name for name in features_model if name not in features_pred]
exceeding_features = [name for name in features_pred if name not in features_model]

print('Variables que faltan en el dataset y hay que crear:\n', missing_features)
print()
print('Variables que sobran en el dataset y hay que borrar:\n', exceeding_features)

Variables que faltan en el dataset y hay que crear:
 []

Variables que sobran en el dataset y hay que borrar:
 ['increased_nr_accounts_10_m+1']


In [128]:
# Add every feature the model has but the dataset lacks as a constant-0 column,
# then discard the columns that are not model features.
X_pred_accounts = (
    X_pred_accounts
    .assign(**{feature_name: 0 for feature_name in missing_features})
    .drop(columns=exceeding_features)
)

In [129]:
# Re-check after the alignment: the dataset columns should now equal the model's features.
features_test = X_pred_accounts.columns.tolist()

print(f'Columnas en dataset: {len(features_test)}')
print(f'Variables en modelos: {len(features_model)}')
print(f'¿Match?: {features_model == features_test}')

Columnas en dataset: 44
Variables en modelos: 44
¿Match?: True


In [130]:
# Reorder the columns to follow the model's feature order.

X_pred_accounts = X_pred_accounts[features_model]

In [131]:
# Create the class predictions for every row of the accounts feature matrix.

predictions_accounts = model_accounts.predict(X_pred_accounts)

In [132]:
# Create the predicted probabilities for every row of the accounts feature matrix.

predictions_accounts_proba = model_accounts.predict_proba(X_pred_accounts)

In [133]:
# Start the propension output frame with the customer ids of the last partition.
# .copy() makes it an independent frame: without it this is a column slice of a
# filtered frame, and adding columns to it is a chained assignment
# (SettingWithCopyWarning, currently hidden by warnings.filterwarnings("ignore")).
df_capstone_clean_with_clusters_last_partition_propension = df_capstone_clean_with_clusters_last_partition[['pk_cid']].copy()

In [134]:
# Put the predictions in a new dataset.

# .copy() makes this an independent frame: without it this is a column slice of a
# filtered frame, and the column assignments below are chained assignments
# (SettingWithCopyWarning, currently hidden by warnings.filterwarnings("ignore")).
df_capstone_clean_with_clusters_last_partition_propension = df_capstone_clean_with_clusters_last_partition[['pk_cid']].copy()

# NOTE(review): the predictions are assigned by position, which relies on the
# pipeline having kept the row order and row count of the last-partition frame.
df_capstone_clean_with_clusters_last_partition_propension['increased_nr_accounts_10_m+1'] = predictions_accounts
# Column 1 of predict_proba - presumably the probability of class 1; confirm against model_accounts.classes_.
df_capstone_clean_with_clusters_last_partition_propension['increased_nr_accounts_10_m+1_proba'] = predictions_accounts_proba[:, 1]

# ROI = predicted class x probability x 10, so it is 0 for predicted negatives.
# NOTE(review): the factor 10 is presumably the value assigned to an account product - confirm.
df_capstone_clean_with_clusters_last_partition_propension['ROI_nr_accounts_10_m+1'] = (
    df_capstone_clean_with_clusters_last_partition_propension['increased_nr_accounts_10_m+1']
    * df_capstone_clean_with_clusters_last_partition_propension['increased_nr_accounts_10_m+1_proba']
    * 10
)

df_capstone_clean_with_clusters_last_partition_propension.head()

Unnamed: 0,pk_cid,increased_nr_accounts_10_m+1,increased_nr_accounts_10_m+1_proba,ROI_nr_accounts_10_m+1
8,16063,0,0.045194,0.0
14,16203,1,0.951505,9.51505
23,16502,0,0.073489,0.0
40,17457,1,0.885266,8.852658
51,17590,0,0.401437,0.0


#### Saving and investment

In [135]:
# Run the last-partition rows through the fitted saving-and-investment pipeline.
# A copy is handed over so the source frame is left untouched (the helper
# functions printed above modify their input in place).
df_capstone_clean_with_clusters_last_partition_saving_and_investment_predictions = pipe_saving_and_investment.transform(
    df_capstone_clean_with_clusters_last_partition.copy()
)

In [136]:
# Inspect columns, dtypes and non-null counts of the transformed saving-and-investment frame.
df_capstone_clean_with_clusters_last_partition_saving_and_investment_predictions.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 442995 entries, 8 to 5962923
Data columns (total 45 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   regions_ca_id_AN                    442995 non-null  int8   
 1   regions_ca_id_AR                    442995 non-null  int8   
 2   regions_ca_id_AS                    442995 non-null  int8   
 3   regions_ca_id_CB                    442995 non-null  int8   
 4   regions_ca_id_CE                    442995 non-null  int8   
 5   regions_ca_id_CL                    442995 non-null  int8   
 6   regions_ca_id_CM                    442995 non-null  int8   
 7   regions_ca_id_CN                    442995 non-null  int8   
 8   regions_ca_id_CT                    442995 non-null  int8   
 9   regions_ca_id_EX                    442995 non-null  int8   
 10  regions_ca_id_Extranjero            442995 non-null  int8   
 11  regions_ca_id_GA              

In [137]:
# Move pk_cid (customer id) from the columns to the index.
df_capstone_clean_with_clusters_last_partition_saving_and_investment_predictions = (
    df_capstone_clean_with_clusters_last_partition_saving_and_investment_predictions.set_index('pk_cid')
)

In [138]:
# Work on a copy so the transformed frame itself is not modified by the column edits below.
X_pred_saving_and_investment = df_capstone_clean_with_clusters_last_partition_saving_and_investment_predictions.copy()

In [139]:
# Show the feature names stored on the saving-and-investment model (its feature_names_in_ attribute).
print(model_saving_and_investment.feature_names_in_)

['regions_ca_id_AN' 'regions_ca_id_AR' 'regions_ca_id_AS'
 'regions_ca_id_CB' 'regions_ca_id_CE' 'regions_ca_id_CL'
 'regions_ca_id_CM' 'regions_ca_id_CN' 'regions_ca_id_CT'
 'regions_ca_id_EX' 'regions_ca_id_Extranjero' 'regions_ca_id_GA'
 'regions_ca_id_IB' 'regions_ca_id_MC' 'regions_ca_id_MD'
 'regions_ca_id_ML' 'regions_ca_id_NC' 'regions_ca_id_PV'
 'regions_ca_id_RI' 'regions_ca_id_VC' 'cluster_-1.0' 'cluster_0.0'
 'cluster_1.0' 'cluster_2.0' 'cluster_3.0' 'cluster_4.0' 'cluster_5.0'
 'cluster_6.0' 'segment' 'age' 'months_from_last_purchase'
 'nr_financing_60' 'nr_account_10' 'nr_debt' 'nr_product_trend'
 'customer_tenure' 'nr_transactions' 'saving_behavior'
 'risk_index_absolute' 'active_customer' 'entry_date'
 'nr_invest_savings_40_trend' 'salary']


In [140]:
# Compare the dataset columns against the features stored on the model:
# the two lists must be equal (same names, same order) before predicting.
features_pred = X_pred_saving_and_investment.columns.tolist()
features_model = list(model_saving_and_investment.feature_names_in_)

print(f'Columnas en dataset: {len(features_pred)}')
print(f'Variables en modelos: {len(features_model)}')
print(f'¿Match?: {features_model == features_pred}')

Columnas en dataset: 44
Variables en modelos: 43
¿Match?: False


In [141]:
# Features the model has but the dataset lacks, and dataset columns the model
# does not have; both keep the order of the list they come from.
missing_features = [name for name in features_model if name not in features_pred]
exceeding_features = [name for name in features_pred if name not in features_model]

print('Variables que faltan en el dataset y hay que crear:\n', missing_features)
print()
print('Variables que sobran en el dataset y hay que borrar:\n', exceeding_features)

Variables que faltan en el dataset y hay que crear:
 []

Variables que sobran en el dataset y hay que borrar:
 ['increased_nr_invest_savings_40_m+1']


In [142]:
# Add every feature the model has but the dataset lacks as a constant-0 column,
# then discard the columns that are not model features.
X_pred_saving_and_investment = (
    X_pred_saving_and_investment
    .assign(**{feature_name: 0 for feature_name in missing_features})
    .drop(columns=exceeding_features)
)

In [143]:
# Re-check after the alignment: the dataset columns should now equal the model's features.
features_test = X_pred_saving_and_investment.columns.tolist()

print(f'Columnas en dataset: {len(features_test)}')
print(f'Variables en modelos: {len(features_model)}')
print(f'¿Match?: {features_model == features_test}')

Columnas en dataset: 43
Variables en modelos: 43
¿Match?: True


In [144]:
# Reorder the columns to follow the model's feature order.

X_pred_saving_and_investment = X_pred_saving_and_investment[features_model]

In [145]:
# Create the class predictions for every row of the saving-and-investment feature matrix.

predictions_saving_and_investments = model_saving_and_investment.predict(X_pred_saving_and_investment)

In [146]:
# Create the predicted probabilities for every row of the saving-and-investment feature matrix.

predictions_saving_and_investments_proba = model_saving_and_investment.predict_proba(X_pred_saving_and_investment)

In [147]:
# Append the saving-and-investment predictions to the propension frame built in
# the accounts section: predicted class, second predict_proba column, and an ROI
# column equal to class x probability x 40 (the lambda is evaluated on the frame
# after the two preceding columns have been added).
df_capstone_clean_with_clusters_last_partition_propension = (
    df_capstone_clean_with_clusters_last_partition_propension.assign(**{
        'increased_nr_invest_savings_40_m+1': predictions_saving_and_investments,
        'increased_nr_invest_savings_40_m+1_proba': predictions_saving_and_investments_proba[:, 1],
        'ROI_nr_invest_savings_40_m+1': lambda frame: (
            frame['increased_nr_invest_savings_40_m+1']
            * frame['increased_nr_invest_savings_40_m+1_proba']
            * 40
        ),
    })
)

df_capstone_clean_with_clusters_last_partition_propension.head()

Unnamed: 0,pk_cid,increased_nr_accounts_10_m+1,increased_nr_accounts_10_m+1_proba,ROI_nr_accounts_10_m+1,increased_nr_invest_savings_40_m+1,increased_nr_invest_savings_40_m+1_proba,ROI_nr_invest_savings_40_m+1
8,16063,0,0.045194,0.0,0,0.000216,0.0
14,16203,1,0.951505,9.51505,0,0.008044,0.0
23,16502,0,0.073489,0.0,0,0.07192,0.0
40,17457,1,0.885266,8.852658,1,0.52393,20.957201
51,17590,0,0.401437,0.0,0,0.027835,0.0


#### Financing

In [148]:
# Run the last-partition rows through the fitted financing pipeline.
# A copy is handed over so the source frame is left untouched (the helper
# functions printed above modify their input in place).
df_capstone_clean_with_clusters_last_partition_financing_predictions = pipe_financing.transform(
    df_capstone_clean_with_clusters_last_partition.copy()
)

In [149]:
# Inspect columns, dtypes and non-null counts of the transformed financing frame.
df_capstone_clean_with_clusters_last_partition_financing_predictions.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 442995 entries, 8 to 5962923
Data columns (total 45 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   regions_ca_id_AN               442995 non-null  int8   
 1   regions_ca_id_AR               442995 non-null  int8   
 2   regions_ca_id_AS               442995 non-null  int8   
 3   regions_ca_id_CB               442995 non-null  int8   
 4   regions_ca_id_CE               442995 non-null  int8   
 5   regions_ca_id_CL               442995 non-null  int8   
 6   regions_ca_id_CM               442995 non-null  int8   
 7   regions_ca_id_CN               442995 non-null  int8   
 8   regions_ca_id_CT               442995 non-null  int8   
 9   regions_ca_id_EX               442995 non-null  int8   
 10  regions_ca_id_Extranjero       442995 non-null  int8   
 11  regions_ca_id_GA               442995 non-null  int8   
 12  regions_ca_id_IB               442

In [150]:
# Move pk_cid (customer id) from the columns to the index.
df_capstone_clean_with_clusters_last_partition_financing_predictions = (
    df_capstone_clean_with_clusters_last_partition_financing_predictions.set_index('pk_cid')
)

In [151]:
# Work on a copy so the transformed frame itself is not modified by the column edits below.
X_pred_financing = df_capstone_clean_with_clusters_last_partition_financing_predictions.copy()

In [152]:
# Show the feature names stored on the financing model (its feature_names_in_ attribute).
print(model_financing.feature_names_in_)

['regions_ca_id_AN' 'regions_ca_id_AR' 'regions_ca_id_AS'
 'regions_ca_id_CB' 'regions_ca_id_CE' 'regions_ca_id_CL'
 'regions_ca_id_CM' 'regions_ca_id_CN' 'regions_ca_id_CT'
 'regions_ca_id_EX' 'regions_ca_id_Extranjero' 'regions_ca_id_GA'
 'regions_ca_id_IB' 'regions_ca_id_MC' 'regions_ca_id_MD'
 'regions_ca_id_ML' 'regions_ca_id_NC' 'regions_ca_id_PV'
 'regions_ca_id_RI' 'regions_ca_id_VC' 'cluster_-1.0' 'cluster_0.0'
 'cluster_1.0' 'cluster_2.0' 'cluster_3.0' 'cluster_4.0' 'cluster_5.0'
 'cluster_6.0' 'segment' 'age' 'months_from_last_purchase'
 'nr_financing_60' 'nr_account_10' 'nr_product_trend' 'customer_tenure'
 'nr_transactions' 'saving_behavior' 'risk_index_absolute'
 'active_customer' 'entry_date' 'nr_financing_60_trend' 'salary'
 'increased_nr_financing_60']


In [153]:
# Compare the dataset columns against the features stored on the model:
# the two lists must be equal (same names, same order) before predicting.
features_pred = X_pred_financing.columns.tolist()
features_model = list(model_financing.feature_names_in_)

print(f'Columnas en dataset: {len(features_pred)}')
print(f'Variables en modelos: {len(features_model)}')
print(f'¿Match?: {features_model == features_pred}')

Columnas en dataset: 44
Variables en modelos: 43
¿Match?: False


In [154]:
# Features the model has but the dataset lacks, and dataset columns the model
# does not have; both keep the order of the list they come from.
missing_features = [name for name in features_model if name not in features_pred]
exceeding_features = [name for name in features_pred if name not in features_model]

print('Variables que faltan en el dataset y hay que crear:\n', missing_features)
print()
print('Variables que sobran en el dataset y hay que borrar:\n', exceeding_features)

Variables que faltan en el dataset y hay que crear:
 []

Variables que sobran en el dataset y hay que borrar:
 ['increased_nr_financing_60_m+1']


In [155]:
# Add every feature the model has but the dataset lacks as a constant-0 column,
# then discard the columns that are not model features.
X_pred_financing = (
    X_pred_financing
    .assign(**{feature_name: 0 for feature_name in missing_features})
    .drop(columns=exceeding_features)
)

In [156]:
# Re-check after the alignment: the dataset columns should now equal the model's features.
features_test = X_pred_financing.columns.tolist()

print(f'Columnas en dataset: {len(features_test)}')
print(f'Variables en modelos: {len(features_model)}')
print(f'¿Match?: {features_model == features_test}')

Columnas en dataset: 43
Variables en modelos: 43
¿Match?: True


In [157]:
# Reorder the columns to follow the model's feature order.

X_pred_financing = X_pred_financing[features_model]

In [158]:
# Create the class predictions for every row of the financing feature matrix.

predictions_financing = model_financing.predict(X_pred_financing)

In [159]:
# Create the predicted probabilities for every row of the financing feature matrix.

predictions_financing_proba = model_financing.predict_proba(X_pred_financing)

In [160]:
# Append the financing predictions to the propension frame built in the previous
# sections: predicted class, second predict_proba column, and an ROI column equal
# to class x probability x 60 (the lambda is evaluated on the frame after the two
# preceding columns have been added). Finally the row index is replaced by a
# fresh 0..n-1 range.
df_capstone_clean_with_clusters_last_partition_propension = (
    df_capstone_clean_with_clusters_last_partition_propension
    .assign(**{
        'increased_nr_financing_60_m+1': predictions_financing,
        'increased_nr_financing_60_m+1_proba': predictions_financing_proba[:, 1],
        'ROI_nr_financing_60_m+1': lambda frame: (
            frame['increased_nr_financing_60_m+1']
            * frame['increased_nr_financing_60_m+1_proba']
            * 60
        ),
    })
    .reset_index(drop=True)
)

df_capstone_clean_with_clusters_last_partition_propension.head()

Unnamed: 0,pk_cid,increased_nr_accounts_10_m+1,increased_nr_accounts_10_m+1_proba,ROI_nr_accounts_10_m+1,increased_nr_invest_savings_40_m+1,increased_nr_invest_savings_40_m+1_proba,ROI_nr_invest_savings_40_m+1,increased_nr_financing_60_m+1,increased_nr_financing_60_m+1_proba,ROI_nr_financing_60_m+1
0,16063,0,0.045194,0.0,0,0.000216,0.0,0,0.00232,0.0
1,16203,1,0.951505,9.51505,0,0.008044,0.0,0,0.142948,0.0
2,16502,0,0.073489,0.0,0,0.07192,0.0,0,0.299887,0.0
3,17457,1,0.885266,8.852658,1,0.52393,20.957201,0,0.380822,0.0
4,17590,0,0.401437,0.0,0,0.027835,0.0,0,0.09248,0.0


### Análisis de las predicciones

In [161]:
# Inspect columns, dtypes and non-null counts of the final propension frame.
df_capstone_clean_with_clusters_last_partition_propension.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442995 entries, 0 to 442994
Data columns (total 10 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   pk_cid                                    442995 non-null  int64  
 1   increased_nr_accounts_10_m+1              442995 non-null  int32  
 2   increased_nr_accounts_10_m+1_proba        442995 non-null  float32
 3   ROI_nr_accounts_10_m+1                    442995 non-null  float64
 4   increased_nr_invest_savings_40_m+1        442995 non-null  int32  
 5   increased_nr_invest_savings_40_m+1_proba  442995 non-null  float32
 6   ROI_nr_invest_savings_40_m+1              442995 non-null  float64
 7   increased_nr_financing_60_m+1             442995 non-null  int32  
 8   increased_nr_financing_60_m+1_proba       442995 non-null  float32
 9   ROI_nr_financing_60_m+1                   442995 non-null  float64
dtypes: float32(3), float

In [162]:
# Preview the first rows transposed (one line per column).
df_capstone_clean_with_clusters_last_partition_propension.head().T

Unnamed: 0,0,1,2,3,4
pk_cid,16063.0,16203.0,16502.0,17457.0,17590.0
increased_nr_accounts_10_m+1,0.0,1.0,0.0,1.0,0.0
increased_nr_accounts_10_m+1_proba,0.045194,0.951505,0.073489,0.885266,0.401437
ROI_nr_accounts_10_m+1,0.0,9.51505,0.0,8.852658,0.0
increased_nr_invest_savings_40_m+1,0.0,0.0,0.0,1.0,0.0
increased_nr_invest_savings_40_m+1_proba,0.000216,0.008044,0.07192,0.52393,0.027835
ROI_nr_invest_savings_40_m+1,0.0,0.0,0.0,20.957201,0.0
increased_nr_financing_60_m+1,0.0,0.0,0.0,0.0,0.0
increased_nr_financing_60_m+1_proba,0.00232,0.142948,0.299887,0.380822,0.09248
ROI_nr_financing_60_m+1,0.0,0.0,0.0,0.0,0.0


In [164]:
# Count predicted classes (NaN included) for each product family; the three
# tables are separated by two blank lines.
print(
    *(
        df_capstone_clean_with_clusters_last_partition_propension[flag_name].value_counts(dropna=False)
        for flag_name in (
            'increased_nr_accounts_10_m+1',
            'increased_nr_invest_savings_40_m+1',
            'increased_nr_financing_60_m+1',
        )
    ),
    sep='\n\n\n',
)

increased_nr_accounts_10_m+1
0    398965
1     44030
Name: count, dtype: int64


increased_nr_invest_savings_40_m+1
0    421857
1     21138
Name: count, dtype: int64


increased_nr_financing_60_m+1
0    407670
1     35325
Name: count, dtype: int64


In [172]:
# Mean predicted probability within each predicted class, per product family;
# the three tables are separated by two blank lines.
print(
    *(
        df_capstone_clean_with_clusters_last_partition_propension.groupby([flag_name])[proba_name].mean()
        for flag_name, proba_name in (
            ('increased_nr_accounts_10_m+1', 'increased_nr_accounts_10_m+1_proba'),
            ('increased_nr_invest_savings_40_m+1', 'increased_nr_invest_savings_40_m+1_proba'),
            ('increased_nr_financing_60_m+1', 'increased_nr_financing_60_m+1_proba'),
        )
    ),
    sep='\n\n\n',
)

increased_nr_accounts_10_m+1
0    0.100744
1    0.707188
Name: increased_nr_accounts_10_m+1_proba, dtype: float32


increased_nr_invest_savings_40_m+1
0    0.034293
1    0.679841
Name: increased_nr_invest_savings_40_m+1_proba, dtype: float32


increased_nr_financing_60_m+1
0    0.038294
1    0.754994
Name: increased_nr_financing_60_m+1_proba, dtype: float32


In [176]:
# Render floats with three decimals to avoid scientific notation.
pd.set_option('display.float_format', '{:.3f}'.format)

# Total ROI within each predicted class, per product family; the three tables
# are separated by two blank lines.
print(
    *(
        df_capstone_clean_with_clusters_last_partition_propension.groupby([flag_name])[roi_name].sum()
        for flag_name, roi_name in (
            ('increased_nr_accounts_10_m+1', 'ROI_nr_accounts_10_m+1'),
            ('increased_nr_invest_savings_40_m+1', 'ROI_nr_invest_savings_40_m+1'),
            ('increased_nr_financing_60_m+1', 'ROI_nr_financing_60_m+1'),
        )
    ),
    sep='\n\n\n',
)

increased_nr_accounts_10_m+1
0        0.000
1   311374.667
Name: ROI_nr_accounts_10_m+1, dtype: float64


increased_nr_invest_savings_40_m+1
0        0.000
1   574819.029
Name: ROI_nr_invest_savings_40_m+1, dtype: float64


increased_nr_financing_60_m+1
0         0.000
1   1600209.663
Name: ROI_nr_financing_60_m+1, dtype: float64


### Exportamos las predicciones

In [None]:
# Write the propension frame (predictions, probabilities and ROI columns) to a
# gzip-compressed pickle in the processed-data folder.
DATA_PATH = "../../data/processed/"
FILE_NAME = "df_predicciones_propension_con_ROI.pkl.gz"
PICKLE_PATH = f"{DATA_PATH}{FILE_NAME}"

df_capstone_clean_with_clusters_last_partition_propension.to_pickle(
    PICKLE_PATH,
    compression='gzip',
)