# Importación de librerías

In [1]:
# Librerías estándar
import os
import sys
import warnings
from dotenv import load_dotenv

# Manipulación de datos
import pandas as pd
import numpy as np

# Configuración de warnings
warnings.filterwarnings('ignore')

# Análisis de nulos
import missingno as msno

# Visualización de datos
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Textos
import unicodedata
from fuzzywuzzy import process
import re

# # Funciones personalizadas
import funciones_data_analisis as fda
import funciones_fintech as ff

[NbConvertApp] Converting notebook funciones_data_analisis.ipynb to script
[NbConvertApp] Writing 9494 bytes to funciones_data_analisis.py


# Ruta archivos



Cargar variables de entorno

In [2]:
load_dotenv()
# Definir ruta donde están los ficheros
path = os.getenv("DATA_PATH")

# Carga de datos

In [3]:
# Obtiene los nombres de los archivos .csv, .xlsx y .xls
files = fda.carga_archivos(path)
files

['bank-additional_bank-additional-full.csv']

In [4]:
# Creamos un dicionario con los DataFrames de los archivos
dict_data = fda.leer_archivos(files,path)

bank-additional_bank-additional-full.csv: (41188, 21)


In [5]:
dict_data.keys()

dict_keys(['bank-additional_bank-additional-full.csv'])

# Exploración inicial de los datos



In [6]:
# Creamos un dicionario vacio para la exploracion
dict_exploracion = {}

for k, v in dict_data.items():
    print(f"EXPLORANDO: {k}")
    print("="*120)
    
    # Ejecutamos y guardamos la exploracion.
    exploracion = fda.exploracion_datos(v)
    dict_exploracion[k] = exploracion
    
    print("\n" + "="*120 + "\n")

EXPLORANDO: bank-additional_bank-additional-full.csv
Exploración inicial de datos:
****************************************************************************************************
El numero de filas es: 41188
El numero de columnas es: 21
****************************************************************************************************
Las 5 primeras filas del dataframe son:


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


****************************************************************************************************
Las 5 últimas filas del dataframe son:


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41187,74,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,3,999,1,failure,-1.1,94.767,-50.8,1.028,4963.6,no


****************************************************************************************************
Muestra aleatoria de 5 filas del dataframe:


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
7776,58,blue-collar,married,basic.9y,no,no,no,telephone,jun,mon,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.865,5228.1,no
30438,46,technician,married,high.school,no,no,no,cellular,may,mon,...,5,3,1,success,-1.8,92.893,-46.2,1.354,5099.1,no
13364,60,unknown,married,basic.4y,unknown,yes,no,cellular,jul,wed,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no
19311,48,admin.,divorced,university.degree,no,yes,no,cellular,aug,wed,...,3,999,0,nonexistent,1.4,93.444,-36.1,4.967,5228.1,no
17899,53,management,single,professional.course,unknown,no,no,cellular,jul,tue,...,5,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,no


****************************************************************************************************
Estadísticos descriptivos del dataframe:


Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


****************************************************************************************************
Resumen de tipología de datos, visualización de nulos y valores únicos:


Unnamed: 0,Tipos de datos,Nulos,Porcentaje Nulos,Numero valores unicos,Porcentaje valores unicos,Valores unicos
age,int64,0,0.0,78,0.189376,"[56, 57, 37, 40, 45, 59, 41, 24, 25, 29, 35, 5..."
job,object,0,0.0,12,0.029135,"[housemaid, services, admin., blue-collar, tec..."
marital,object,0,0.0,4,0.009712,"[married, single, divorced, unknown]"
education,object,0,0.0,8,0.019423,"[basic.4y, high.school, basic.6y, basic.9y, pr..."
default,object,0,0.0,3,0.007284,"[no, unknown, yes]"
housing,object,0,0.0,3,0.007284,"[no, yes, unknown]"
loan,object,0,0.0,3,0.007284,"[no, yes, unknown]"
contact,object,0,0.0,2,0.004856,"[telephone, cellular]"
month,object,0,0.0,10,0.024279,"[may, jun, jul, aug, oct, nov, dec, mar, apr, ..."
day_of_week,object,0,0.0,5,0.012139,"[mon, tue, wed, thu, fri]"


****************************************************************************************************




# Limpieza de datos

In [7]:
# Asignamos a la variable df_bank el archivo bank-additional_bank-additional-full.csv
df_bank = dict_data['bank-additional_bank-additional-full.csv']

Renombramos las columnas para que sean mas entendibles

In [8]:
nombre_columnas = ['age', 'job', 'marital_status', 'education', 'credit_default', 'housing_loan', 'personal_loan',
   'contact_type', 'last_contact_month', 'last_contact_day', 'last_contact_duration_secs', 
   'number_contacts', 'number_days_last_contact','numbrer_of_previous_contacts', 
   'outcome_previous_campaign', 'employement_vaiation_rate', 'consumer_price_index','consumer_confidence_index', 
   'euribor_3m', 'number_employees', 'subscribed_term_deposit']
df_bank.columns = nombre_columnas

In [9]:
# Renombramos todas las apariciones de admin a administrative_staff
df_bank['job'] = df_bank['job'].replace('admin', 'administrative_staff')

# Ejecutamos las funciones de normalizar textos y crear las columnas categoricas.
df_bank = fda.normalizar_textos(df_bank)
df_bank = ff.columnas_categoricas(df_bank)



In [10]:
# Creamos una copia para pasar la funcion normalizar_binario de la columna subscribed_term_deposit.
df_binario = df_bank.copy()
fda.normalizar_binario(df_binario,"subscribed_term_deposit")

Unnamed: 0,age,job,marital_status,education,credit_default,housing_loan,personal_loan,contact_type,last_contact_month,last_contact_day,...,number_contacts,number_days_last_contact,numbrer_of_previous_contacts,outcome_previous_campaign,employement_vaiation_rate,consumer_price_index,consumer_confidence_index,euribor_3m,number_employees,subscribed_term_deposit
0,56,housemaid,married,basic_4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high_school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high_school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,,married,basic_6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high_school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional_course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1
41184,46,blue_collar,married,professional_course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0
41185,56,retired,married,university_degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0
41186,44,technician,married,professional_course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1


In [11]:
df_binario.dtypes

age                                int64
job                             category
marital_status                  category
education                       category
credit_default                  category
housing_loan                    category
personal_loan                   category
contact_type                    category
last_contact_month              category
last_contact_day                category
last_contact_duration_secs         int64
number_contacts                    int64
number_days_last_contact           int64
numbrer_of_previous_contacts       int64
outcome_previous_campaign       category
employement_vaiation_rate        float64
consumer_price_index             float64
consumer_confidence_index        float64
euribor_3m                       float64
number_employees                 float64
subscribed_term_deposit            int64
dtype: object