In [1]:
import warnings
warnings.filterwarnings("ignore")

# imports best practice pandas
import os

import gzip
import numpy as np
import pandas as pd
import missingno as msno
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
%matplotlib inline
import math
import pickle
import joblib
import dill
import inspect

#--------------------------------------------------------
# imports best practice sklearn
import sklearn
from sklearn.feature_selection import VarianceThreshold
from sklearn import set_config

from sklearn.tree import DecisionTreeClassifier

# preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder, RobustScaler, MinMaxScaler
from scipy import stats

# transformers
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import FunctionTransformer


from sklearn.cluster import KMeans

# evaluacion
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
set_config(transform_output = "pandas")

#models
from sklearn.cluster import KMeans

In [2]:
# Load the serialized helper functions and the preprocessing pipeline.
#
# NOTE(review): dill/pickle deserialization executes arbitrary code. Only load
# artifacts produced by this project; never point these paths at untrusted files.

CWD = os.getcwd()
FUNCTIONS_PATH = os.path.join(CWD, r"functions")
PIPE_PATH = os.path.join(CWD, "pipeline")


def _load_serialized_function(name):
    """Deserialize and return the object stored in ``FUNCTIONS_PATH/<name>.joblib``.

    The files carry a ``.joblib`` extension but are read with ``dill``, exactly
    as the original cell did.

    Parameters
    ----------
    name : str
        File stem of the artifact (without the ``.joblib`` extension).

    Returns
    -------
    object
        Whatever object ``dill.load`` reconstructs from the file.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    with open(os.path.join(FUNCTIONS_PATH, f"{name}.joblib"), "rb") as handle:
        return dill.load(handle)


# Each helper is bound to a module-level name matching its file stem.
# Presumably the pickled pipeline refers to these functions by name, so they
# must exist in the notebook namespace before it is unpickled — TODO confirm.
convertir_deceased_a_boolean = _load_serialized_function("convertir_deceased_a_boolean")
bfill_ffill_for_all_columns_with_NaNs = _load_serialized_function("bfill_ffill_for_all_columns_with_NaNs")
imputer_for_region_code = _load_serialized_function("imputer_for_region_code")
assign_oldest_entry_date = _load_serialized_function("assign_oldest_entry_date")
imputer_for_missing_salary_and_non_missing_segment = _load_serialized_function("imputer_for_missing_salary_and_non_missing_segment")
imputer_for_missing_salary_and_missing_segment = _load_serialized_function("imputer_for_missing_salary_and_missing_segment")
imputer_for_non_missing_salary_and_missing_segment = _load_serialized_function("imputer_for_non_missing_salary_and_missing_segment")
imputer_for_entry_channel = _load_serialized_function("imputer_for_entry_channel")
# imputer_for_gender was loaded twice in the original cell; once is enough.
imputer_for_gender = _load_serialized_function("imputer_for_gender")
correct_data_types_and_drop_columns = _load_serialized_function("correct_data_types_and_drop_columns")

# Use a context manager so the pipeline file handle is closed after loading.
with open(os.path.join(PIPE_PATH, 'pipeline.pkl'), 'rb') as pipe_file:
    pipe = pickle.load(pipe_file)

pipe

In [3]:
# Location of the merged capstone dataset (gzip-compressed pickle).
DATA_PATH = "../../data/processed/"
FILE_NAME = "df_capstone_merged.pkl.gz"
PICKLE_PATH = f"{DATA_PATH}{FILE_NAME}"

# Let pandas handle the gzip decompression while reading the pickle.
df_capstone = pd.read_pickle(PICKLE_PATH, compression="gzip")

In [4]:
# Summarize the loaded frame: column dtypes plus non-null counts per column.
df_capstone.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5962924 entries, 0 to 5962923
Data columns (total 27 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   pk_cid              5962924 non-null  int64  
 1   pk_partition        5962924 non-null  object 
 2   country_id          5962924 non-null  object 
 3   region_code         5960660 non-null  float64
 4   gender              5962899 non-null  object 
 5   age                 5962924 non-null  int64  
 6   deceased            5962924 non-null  object 
 7   salary              4450821 non-null  float64
 8   entry_date          5962924 non-null  object 
 9   entry_channel       5829891 non-null  object 
 10  active_customer     5962924 non-null  float64
 11  segment             5828980 non-null  object 
 12  short_term_deposit  5962924 non-null  int64  
 13  loans               5962924 non-null  int64  
 14  mortgage            5962924 non-null  int64  
 15  funds          

In [5]:
def listarNulos(dataset: pd.DataFrame) -> pd.DataFrame:
    """Summarize the columns of ``dataset`` that contain missing values.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        One row per column that has at least one null, indexed by column name
        and sorted by null count in descending order, with two columns:
        ``Nulos`` (number of nulls) and ``porcentaje`` (nulls as a percentage
        of the row count, rounded to 2 decimals). The frame is empty when no
        column has nulls.
    """
    # Compute the per-column null counts once; the original scanned the frame twice.
    null_counts = dataset.isnull().sum()
    df_stat = (
        null_counts[null_counts > 0]
        .sort_values(ascending=False)
        .to_frame(name="Nulos")
    )
    percent = (df_stat["Nulos"] / dataset.shape[0]).mul(100)
    df_stat["porcentaje"] = percent.round(2)
    return df_stat

In [6]:
# List the columns of the loaded frame that contain nulls, with counts and percentages.
listarNulos(df_capstone)

Unnamed: 0,Nulos,porcentaje
salary,1512103,25.36
segment,133944,2.25
entry_channel,133033,2.23
region_code,2264,0.04
payroll,61,0.0
pension_plan,61,0.0
gender,25,0.0


In [7]:
# Run the loaded pipeline's transform on the merged frame. No fit is performed
# here, so the pipeline is presumably already fitted — TODO confirm.
df_capstone_clean = pipe.transform(df_capstone)

In [8]:
# Inspect the columns and dtypes of the transformed frame.
df_capstone_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5962924 entries, 0 to 5962923
Data columns (total 28 columns):
 #   Column              Dtype         
---  ------              -----         
 0   pk_cid              int64         
 1   pk_partition        datetime64[ns]
 2   country_id          object        
 3   region_code         object        
 4   gender              object        
 5   age                 int64         
 6   deceased            int64         
 7   salary              float64       
 8   entry_date          datetime64[ns]
 9   entry_channel       object        
 10  active_customer     int64         
 11  segment             object        
 12  short_term_deposit  int64         
 13  loans               int64         
 14  mortgage            int64         
 15  funds               int64         
 16  securities          int64         
 17  long_term_deposit   int64         
 18  em_account_pp       int64         
 19  credit_card         int64         
 20  pa

In [9]:
# Re-run the null listing on the transformed frame to check whether any missing values remain.
listarNulos(df_capstone_clean)

Unnamed: 0,Nulos,porcentaje


In [10]:
# Destination for the cleaned dataset (gzip-compressed pickle).
DATA_PATH = "../../data/processed/"
FILE_NAME = "df_capstone_clean.pkl.gz"
PICKLE_PATH = f"{DATA_PATH}{FILE_NAME}"

# Persist the transformed frame to disk.
df_capstone_clean.to_pickle(path=PICKLE_PATH, compression="gzip")