# Revisión tabla customer_products.csv

In [106]:
# Importamos las librerias necesarias.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, OrdinalEncoder

In [107]:
# Creamos dataframe desde fichero .csv
file = 'customer_products.csv'
path = '/Users/orlando/Documents/EasyMoney/tablas/'
df_cust_prod = pd.read_csv(path+file)

In [108]:
df_cust_prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5962924 entries, 0 to 5962923
Data columns (total 18 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Unnamed: 0          int64  
 1   pk_cid              int64  
 2   pk_partition        object 
 3   short_term_deposit  int64  
 4   loans               int64  
 5   mortgage            int64  
 6   funds               int64  
 7   securities          int64  
 8   long_term_deposit   int64  
 9   em_account_pp       int64  
 10  credit_card         int64  
 11  payroll             float64
 12  pension_plan        float64
 13  payroll_account     int64  
 14  emc_account         int64  
 15  debit_card          int64  
 16  em_account_p        int64  
 17  em_acount           int64  
dtypes: float64(2), int64(15), object(1)
memory usage: 818.9+ MB


In [109]:
df_cust_prod.head()

Unnamed: 0.1,Unnamed: 0,pk_cid,pk_partition,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount
0,0,1375586,2018-01,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,1
1,1,1050611,2018-01,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,1
2,2,1050612,2018-01,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,1
3,3,1050613,2018-01,1,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0
4,4,1050614,2018-01,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,1


In [110]:
# Eliminamos la columna 'Unnamed: 0' por ser igual que el index.
df_cust_prod.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [111]:
# No hay duplicados.
df_cust_prod[df_cust_prod.duplicated(keep=False)]


Unnamed: 0,pk_cid,pk_partition,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount


In [112]:
# Revisamos porcentaje de valores nulos y encontramos algunos en las columnas 'payroll' y 'pension_plan'.
df_cust_prod.isna().sum()/len(df_cust_prod) * 100

pk_cid                0.000000
pk_partition          0.000000
short_term_deposit    0.000000
loans                 0.000000
mortgage              0.000000
funds                 0.000000
securities            0.000000
long_term_deposit     0.000000
em_account_pp         0.000000
credit_card           0.000000
payroll               0.001023
pension_plan          0.001023
payroll_account       0.000000
emc_account           0.000000
debit_card            0.000000
em_account_p          0.000000
em_acount             0.000000
dtype: float64

In [113]:
# Como los valores nulos corresponden a un porcentaje tan pequeño del total, podrímos optar por eliminarlos directamente.
df_cust_prod = df_cust_prod.dropna()

In [114]:
# Restablecemos los índices del dataframe tras haber eliminado filas.
df_cust_prod.reset_index(drop=True, inplace=True)

In [115]:
# Dejaremos los datos del tipo int a excepción de la columna 'pk_partition' que podríamos considerar quitarla si su información no resulta relevante.
# df_cust_prod = df_cust_prod.drop('pk_partition', axis=1)
# df_cust_prod = df_cust_prod.astype(int)

obj_col = 'pk_partition'
int_col = df_cust_prod.columns[df_cust_prod.columns != obj_col]
df_cust_prod[int_col] = df_cust_prod[int_col].astype(int)

In [116]:
df_cust_prod_clean = df_cust_prod.copy()

In [117]:
df_cust_prod_clean.describe()

Unnamed: 0,pk_cid,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount
count,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0,5962863.0
mean,1234930.0,0.002581646,7.848579e-05,5.433631e-05,0.003370696,0.003711808,0.01688115,0.0,0.01188456,0.03451144,0.03652641,0.05524527,0.05483289,0.09441203,5.701959e-06,0.734815
std,162302.6,0.05074427,0.008858874,0.007371117,0.05795977,0.06081144,0.1288262,0.0,0.1083666,0.1825388,0.187596,0.2284584,0.2276538,0.2924011,0.002387871,0.4414317
min,15891.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1112532.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1231096.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1352339.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1553689.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [118]:
# Observamos una fuerte correlación entre las variables 'payroll' y 'pension_plan', y éstas a su vez con 'payroll_account'.
# Lo que nos indica una alta tendencia a domiciliar los planes de pensiones.
df_num = df_cust_prod_clean.select_dtypes(include='number')
corr = df_num.corr()
corr.style.background_gradient(cmap='coolwarm')

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,pk_cid,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount
pk_cid,1.0,0.040609,-0.0046,-0.008614,-0.042348,-0.035902,-0.050703,,-0.078287,-0.030165,-0.034929,-0.041077,-0.121658,-0.042866,-0.011115,-0.062365
short_term_deposit,0.040609,1.0,-0.000451,-0.000375,0.000976,-0.000986,0.002697,,-0.002865,-0.006197,-0.006382,-0.007558,-0.007129,-0.011183,-0.000121,-0.020108
loans,-0.0046,-0.000451,1.0,-6.5e-05,0.000791,-0.000541,0.001925,,0.036063,0.016578,0.017246,0.016087,0.007595,0.020641,-2.1e-05,-0.005785
mortgage,-0.008614,-0.000375,-6.5e-05,1.0,0.006245,-0.00045,-0.000966,,0.027115,0.027149,0.026338,0.021521,0.006919,0.017773,-1.8e-05,-0.009848
funds,-0.042348,0.000976,0.000791,0.006245,1.0,0.097417,0.148276,,0.039948,0.021119,0.02335,0.026263,0.096035,0.021675,-0.000139,-0.00352
securities,-0.035902,-0.000986,-0.000541,-0.00045,0.097417,1.0,0.030855,,0.054968,0.040084,0.04117,0.040623,0.082815,0.049887,0.019488,0.001833
long_term_deposit,-0.050703,0.002697,0.001925,-0.000966,0.148276,0.030855,1.0,,0.041465,0.017623,0.020917,0.021419,0.183211,0.01387,-0.000313,-0.027843
em_account_pp,,,,,,,,,,,,,,,,
credit_card,-0.078287,-0.002865,0.036063,0.027115,0.039948,0.054968,0.041465,,1.0,0.223382,0.229744,0.234352,0.153994,0.208664,0.002979,-0.091292
payroll,-0.030165,-0.006197,0.016578,0.027149,0.021119,0.040084,0.017623,,0.223382,1.0,0.971012,0.731185,0.141737,0.353162,0.001472,-0.281506
