In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd
import pickle

# Otros objetivos
# -----------------------------------------------------------------------
import math

# Gráficos
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

from itertools import product, combinations
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, RobustScaler
from sklearn.neighbors import LocalOutlierFactor # para detectar outliers usando el método LOF
from sklearn.ensemble import IsolationForest # para detectar outliers usando el metodo IF

# Para imputar nulos
# -------------------------------------------------------------------------
from sklearn.experimental import enable_iterative_imputer  # Habilita IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Para tratar el problema de desbalance
# -----------------------------------------------------------------------
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder #, TargetEncoder # para poder aplicar los métodos de OneHot, Ordinal,  Label y Target Encoder 

from category_encoders import TargetEncoder 

# Display configuration
# -----------------------------------------------------------------------
# Lift the column cap so wide DataFrames are rendered with every column.
pd.options.display.max_columns = None


In [2]:
import sys
import os
# Make the parent directory importable (the `src` package is imported from
# it below). The path is resolved against the current working directory.
# The membership guard keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
project_root = os.path.abspath('../')
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
# Importación de las clases y funciones creadas en nuestro archivo de soporte
# -----------------------------------------------------------------------
from src import soporte_preprocesamiento as sp
from src import soporte_encoding as se
from src import soporte_logistica_preprocesamiento as slp

In [4]:
import warnings
# Install a blanket "ignore" filter: every warning category is suppressed
# from here on (presumably to keep cell outputs clean).
# NOTE(review): this also hides warnings raised by the cells below.
warnings.filterwarnings("ignore")

In [5]:
# Load the encoded dataset produced by an earlier step, then discard the
# stored index in favour of a fresh 0..n-1 RangeIndex.
df = pd.read_pickle("../datos/datos_encoded.pkl")
df = df.reset_index(drop=True)


In [6]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,EducationField,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,JobInvolvement,Gender_Female,Gender_Male,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,JobLevel_Head,JobLevel_Intern,JobLevel_Junior,JobLevel_Manager,JobLevel_Senior,StockOptionLevel_Bad,StockOptionLevel_Best,StockOptionLevel_Better,StockOptionLevel_Good
0,0.137778,0.114119,0.167812,51.0,0.0,0.149569,0.150224,6.0,0.166667,0.145038,0.124814,1442.76,1.0,11.0,1.0,0.061538,1.0,0.0,0.153226,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.137778,0.164286,0.178414,31.0,1.0,0.249097,0.157128,10.0,0.166667,0.181507,0.255319,460.79,0.0,23.0,6.0,0.175153,5.0,1.0,0.16,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   EnvironmentSatisfaction  4410 non-null   float64
 1   JobSatisfaction          4410 non-null   float64
 2   WorkLifeBalance          4410 non-null   float64
 3   Age                      4410 non-null   float64
 4   Attrition                4410 non-null   float64
 5   BusinessTravel           4410 non-null   float64
 6   Department               4410 non-null   float64
 7   DistanceFromHome         4410 non-null   float64
 8   EducationField           4410 non-null   float64
 9   JobRole                  4410 non-null   float64
 10  MaritalStatus            4410 non-null   float64
 11  MonthlyIncome            4410 non-null   float64
 12  NumCompaniesWorked       4410 non-null   float64
 13  PercentSalaryHike        4410 non-null   float64
 14  TotalWorkingYears       

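En el EDA detectamos columnas con presencia de outliers univariados; para que influyan lo menos posible en el escalado, vamos a usar RobustScaler como método de escalado de los datos. Este escalador centra cada variable en su mediana y la divide por su rango intercuartílico, estadísticos poco sensibles a los valores extremos.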

In [8]:
# Scale every column except the "Attrition" label using the project's
# scaling helper; per the original note, the helper falls back to
# RobustScaler() when no other scaler is passed.
# NOTE(review): the scaler is fitted on the full dataset here; confirm this
# happens intentionally before any train/test split.
# NOTE(review): the returned `scaler` is not persisted in this notebook.
label_col = "Attrition"
feature_cols = df.columns.drop(label_col)
df_robust, scaler = sp.escalar_datos(data=df, cols=feature_cols)

# Re-attach the untouched label as the leading column of the scaled frame.
df_robust = pd.concat([df[label_col], df_robust], axis=1)

In [9]:
# Preview the first rows of the scaled frame (Attrition is the leading column).
df_robust.head()

Unnamed: 0,Attrition,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,DistanceFromHome,EducationField,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,JobInvolvement,Gender_Female,Gender_Male,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,JobLevel_Head,JobLevel_Intern,JobLevel_Junior,JobLevel_Manager,JobLevel_Senior,StockOptionLevel_Bad,StockOptionLevel_Best,StockOptionLevel_Better,StockOptionLevel_Good
0,0.0,0.0,-0.975743,1.0,1.153846,0.0,-1.0,-0.083333,0.533046,-0.463706,0.0,1.498811,-0.333333,-0.5,-1.0,-3.415572,-0.666667,-0.333333,0.0,1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.424875,-0.384615,0.099529,0.0,0.25,0.533046,0.523841,1.0,-0.13348,-0.666667,1.5,-0.444444,0.102377,0.0,0.0,1.0,1.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.765804,0.0,6.850501,-0.307692,0.099529,0.0,0.833333,-4.206791,0.177356,0.0,2.634668,-0.333333,0.166667,-0.555556,0.0,0.0,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,-0.234196,-0.975743,0.0,0.153846,-0.069569,0.0,-0.416667,0.533046,-0.745947,0.0,0.622052,0.333333,-0.5,0.333333,-0.897623,0.5,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,-0.234196,1.260038,0.0,-0.307692,0.0,0.0,0.25,0.0,0.177356,1.0,-0.471201,0.666667,-0.333333,-0.111111,0.0,0.166667,-0.333333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# Persist the scaled frame (label + scaled features) as a pickle file.
scaled_data_path = "../datos/datos_scaled.pkl"
df_robust.to_pickle(scaled_data_path)