# Revisión tabla customer_products.csv

In [1]:
# Importamos las librerias necesarias.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, OrdinalEncoder

In [2]:
# Creamos dataframe desde fichero .csv
file = 'customer_products.csv'
path = '/Users/orlando/Documents/EasyMoney_back/tablas/'
df_cust_prod = pd.read_csv(path+file, index_col=0)

In [3]:
df_cust_prod.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5962924 entries, 0 to 13647308
Data columns (total 17 columns):
 #   Column              Dtype  
---  ------              -----  
 0   pk_cid              int64  
 1   pk_partition        object 
 2   short_term_deposit  int64  
 3   loans               int64  
 4   mortgage            int64  
 5   funds               int64  
 6   securities          int64  
 7   long_term_deposit   int64  
 8   em_account_pp       int64  
 9   credit_card         int64  
 10  payroll             float64
 11  pension_plan        float64
 12  payroll_account     int64  
 13  emc_account         int64  
 14  debit_card          int64  
 15  em_account_p        int64  
 16  em_acount           int64  
dtypes: float64(2), int64(14), object(1)
memory usage: 818.9+ MB


In [4]:
df_cust_prod.head()

Unnamed: 0,pk_cid,pk_partition,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount
0,1375586,2018-01,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,1
1,1050611,2018-01,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,1
2,1050612,2018-01,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,1
3,1050613,2018-01,1,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0
4,1050614,2018-01,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,1


In [5]:
# Eliminamos la columna 'Unnamed: 0' por ser igual que el index.
# df_cust_prod.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [6]:
# No hay registros duplicados.
df_cust_prod[df_cust_prod.duplicated(keep=False)]


Unnamed: 0,pk_cid,pk_partition,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount


In [7]:
# Revisamos porcentaje de valores nulos y encontramos algunos en las columnas 'payroll' y 'pension_plan'.
df_cust_prod.isna().sum()/len(df_cust_prod) * 100

pk_cid                0.000000
pk_partition          0.000000
short_term_deposit    0.000000
loans                 0.000000
mortgage              0.000000
funds                 0.000000
securities            0.000000
long_term_deposit     0.000000
em_account_pp         0.000000
credit_card           0.000000
payroll               0.001023
pension_plan          0.001023
payroll_account       0.000000
emc_account           0.000000
debit_card            0.000000
em_account_p          0.000000
em_acount             0.000000
dtype: float64

In [8]:
# Como los valores nulos corresponden a un porcentaje tan pequeño del total, podrímos optar por eliminarlos directamente.
df_cust_prod = df_cust_prod.dropna()

In [9]:
# Restablecemos los índices del dataframe tras haber eliminado filas.
df_cust_prod.reset_index(drop=True, inplace=True)

In [10]:
# Dejaremos los datos del tipo int a excepción de la columna 'pk_partition' que podríamos considerar quitarla si su información no resulta relevante.
# df_cust_prod = df_cust_prod.drop('pk_partition', axis=1)
# df_cust_prod = df_cust_prod.astype(int)

obj_col = 'pk_partition'
int_col = df_cust_prod.columns[df_cust_prod.columns != obj_col]
df_cust_prod[int_col] = df_cust_prod[int_col].astype(int)

In [11]:
df_cust_prod_clean = df_cust_prod.copy()

In [12]:
# Observamos que nadie tiene contratado el producto em_account_pp (tendencia de cuenta easyMoney++)
df_cust_prod_clean.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pk_cid,5962863.0,1234930.0,162302.567746,15891.0,1112532.0,1231096.0,1352339.0,1553689.0
short_term_deposit,5962863.0,0.002581646,0.050744,0.0,0.0,0.0,0.0,1.0
loans,5962863.0,7.848579e-05,0.008859,0.0,0.0,0.0,0.0,1.0
mortgage,5962863.0,5.433631e-05,0.007371,0.0,0.0,0.0,0.0,1.0
funds,5962863.0,0.003370696,0.05796,0.0,0.0,0.0,0.0,1.0
securities,5962863.0,0.003711808,0.060811,0.0,0.0,0.0,0.0,1.0
long_term_deposit,5962863.0,0.01688115,0.128826,0.0,0.0,0.0,0.0,1.0
em_account_pp,5962863.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
credit_card,5962863.0,0.01188456,0.108367,0.0,0.0,0.0,0.0,1.0
payroll,5962863.0,0.03451144,0.182539,0.0,0.0,0.0,0.0,1.0


In [13]:
# Identificador de cliente.
df_cust_prod_clean['pk_cid'].value_counts()

pk_cid
1375586    17
1328802    17
1328781    17
1328782    17
1328783    17
           ..
1521045     1
1545224     1
1470941     1
1521049     1
1550586     1
Name: count, Length: 456373, dtype: int64

In [14]:
# Identificador de Partición. Mensualmente se ingesta en la tabla una partición con el estado de la base de clientes.
df_cust_prod['pk_partition'].value_counts(normalize=True)*100

pk_partition
2019-05    7.429233
2019-04    7.372750
2019-03    7.314993
2019-02    7.240264
2019-01    7.158893
2018-12    7.085204
2018-11    6.983005
2018-10    6.746759
2018-09    6.294342
2018-08    5.918667
2018-07    5.690874
2018-06    4.227902
2018-05    4.191208
2018-04    4.149902
2018-03    4.112924
2018-02    4.066956
2018-01    4.016124
Name: proportion, dtype: float64

In [15]:
# Tenencia de Short-term deposits.
df_cust_prod_clean['short_term_deposit'].value_counts(normalize=True)*100

short_term_deposit
0    99.741835
1     0.258165
Name: proportion, dtype: float64

In [16]:
# Tenencia de Préstamos.
df_cust_prod_clean['loans'].value_counts(normalize=True)*100

loans
0    99.992151
1     0.007849
Name: proportion, dtype: float64

In [17]:
# Tenencia de Mortgage (hipotecas).
df_cust_prod_clean['mortgage'].value_counts(normalize=True)*100

mortgage
0    99.994566
1     0.005434
Name: proportion, dtype: float64

In [18]:
# Tenencia de Fondos de inversión.
df_cust_prod_clean['funds'].value_counts(normalize=True)*100

funds
0    99.66293
1     0.33707
Name: proportion, dtype: float64

In [19]:
# Tenencia de Valores.
df_cust_prod_clean['securities'].value_counts(normalize=True)*100

securities
0    99.628819
1     0.371181
Name: proportion, dtype: float64

In [20]:
# Tenencia de Depósitos a largo plazo.
df_cust_prod_clean['long_term_deposit'].value_counts(normalize=True)*100

long_term_deposit
0    98.311885
1     1.688115
Name: proportion, dtype: float64

In [21]:
# Tenencia de Cuenta easyMoney++
df_cust_prod_clean['em_account_pp'].value_counts(normalize=True)*100

em_account_pp
0    100.0
Name: proportion, dtype: float64

In [22]:
# Tenencia de tarjetas de crédito
df_cust_prod_clean['credit_card'].value_counts(normalize=True)*100

credit_card
0    98.811544
1     1.188456
Name: proportion, dtype: float64

In [23]:
# Tenencia de Domiciliaciones.
df_cust_prod_clean['payroll'].value_counts(normalize=True)*100

payroll
0    96.548856
1     3.451144
Name: proportion, dtype: float64

In [24]:
# Tenencia de Plan de pensiones.
df_cust_prod_clean['pension_plan'].value_counts(normalize=True)*100

pension_plan
0    96.347359
1     3.652641
Name: proportion, dtype: float64

In [25]:
# Tenencia de Cuenta bonificada por domiciliaciones.
df_cust_prod_clean['payroll_account'].value_counts(normalize=True)*100

payroll_account
0    94.475473
1     5.524527
Name: proportion, dtype: float64

In [26]:
# Tenencia de Cuenta easyMoney Crypto.
df_cust_prod_clean['emc_account'].value_counts(normalize=True)*100

emc_account
0    94.516711
1     5.483289
Name: proportion, dtype: float64

In [27]:
# Tenencia de tarjetas de débito.
df_cust_prod_clean['debit_card'].value_counts(normalize=True)*100

debit_card
0    90.558797
1     9.441203
Name: proportion, dtype: float64

In [28]:
# Tenencia de Cuenta easyMoney+.
df_cust_prod_clean['em_account_p'].value_counts(normalize=True)*100

em_account_p
0    99.99943
1     0.00057
Name: proportion, dtype: float64

In [29]:
# Tenencia de Cuenta easyMoney.
df_cust_prod_clean['em_acount'].value_counts(normalize=True)*100

em_acount
1    73.481497
0    26.518503
Name: proportion, dtype: float64

In [30]:
# Observamos una fuerte correlación entre las variables 'payroll', 'pension_plan' y 'payroll_account'.
# Lo que nos indica una alta tendencia a domiciliar los planes de pensiones.
df_num = df_cust_prod_clean.select_dtypes(include='number')
corr = df_num.corr()
corr.style.background_gradient(cmap='coolwarm')

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,pk_cid,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount
pk_cid,1.0,0.040609,-0.0046,-0.008614,-0.042348,-0.035902,-0.050703,,-0.078287,-0.030165,-0.034929,-0.041077,-0.121658,-0.042866,-0.011115,-0.062365
short_term_deposit,0.040609,1.0,-0.000451,-0.000375,0.000976,-0.000986,0.002697,,-0.002865,-0.006197,-0.006382,-0.007558,-0.007129,-0.011183,-0.000121,-0.020108
loans,-0.0046,-0.000451,1.0,-6.5e-05,0.000791,-0.000541,0.001925,,0.036063,0.016578,0.017246,0.016087,0.007595,0.020641,-2.1e-05,-0.005785
mortgage,-0.008614,-0.000375,-6.5e-05,1.0,0.006245,-0.00045,-0.000966,,0.027115,0.027149,0.026338,0.021521,0.006919,0.017773,-1.8e-05,-0.009848
funds,-0.042348,0.000976,0.000791,0.006245,1.0,0.097417,0.148276,,0.039948,0.021119,0.02335,0.026263,0.096035,0.021675,-0.000139,-0.00352
securities,-0.035902,-0.000986,-0.000541,-0.00045,0.097417,1.0,0.030855,,0.054968,0.040084,0.04117,0.040623,0.082815,0.049887,0.019488,0.001833
long_term_deposit,-0.050703,0.002697,0.001925,-0.000966,0.148276,0.030855,1.0,,0.041465,0.017623,0.020917,0.021419,0.183211,0.01387,-0.000313,-0.027843
em_account_pp,,,,,,,,,,,,,,,,
credit_card,-0.078287,-0.002865,0.036063,0.027115,0.039948,0.054968,0.041465,,1.0,0.223382,0.229744,0.234352,0.153994,0.208664,0.002979,-0.091292
payroll,-0.030165,-0.006197,0.016578,0.027149,0.021119,0.040084,0.017623,,0.223382,1.0,0.971012,0.731185,0.141737,0.353162,0.001472,-0.281506


In [31]:
df_cust_prod_clean

Unnamed: 0,pk_cid,pk_partition,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount
0,1375586,2018-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1050611,2018-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,1050612,2018-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1050613,2018-01,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1050614,2018-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5962858,1166765,2019-05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5962859,1166764,2019-05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5962860,1166763,2019-05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5962861,1166789,2019-05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
