In [9]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd
import pickle

# Otros objetivos
# -----------------------------------------------------------------------
import math

# Gráficos
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

from itertools import product, combinations
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, RobustScaler
from sklearn.neighbors import LocalOutlierFactor # para detectar outliers usando el método LOF
from sklearn.ensemble import IsolationForest # para detectar outliers usando el metodo IF

# Para imputar nulos
# -------------------------------------------------------------------------
from sklearn.experimental import enable_iterative_imputer  # Habilita IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Para tratar el problema de desbalance
# -----------------------------------------------------------------------
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder #, TargetEncoder # para poder aplicar los métodos de OneHot, Ordinal,  Label y Target Encoder 

from category_encoders import TargetEncoder 

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # None removes the column limit, so every DataFrame column is displayed


In [10]:
import sys

# Add the directory two levels above the working directory to the import
# search path (presumably the project root that holds the `src` package).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if "../../" not in sys.path:
    sys.path.append("../../")

In [11]:
# Importación de las clases y funciones creadas en nuestro archivo de soporte
# -----------------------------------------------------------------------
from src import soporte_preprocesamiento as sp
from src import soporte_encoding as se
from src import soporte_logistica_preprocesamiento as slp

In [12]:
import warnings
# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised from this point on is suppressed.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn.
warnings.filterwarnings("ignore")

In [None]:
# Load the pickled dataset (datos_encoded.pkl) and replace its stored index
# with a fresh 0..n-1 RangeIndex; the old index is dropped, not kept as a column.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# were produced by this project.
df = (
    pd.read_pickle("../../datos/modelo3/datos_encoded.pkl")
    .reset_index(drop=True)
)


In [14]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,EducationField,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,JobInvolvement,Gender_Female,Gender_Male,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,JobLevel_Head,JobLevel_Intern,JobLevel_Junior,JobLevel_Manager,JobLevel_Senior,StockOptionLevel_Bad,StockOptionLevel_Best,StockOptionLevel_Better,StockOptionLevel_Good
0,0.140594,0.112245,0.170868,51.0,0.0,0.145927,0.146751,6.0,0.164885,0.144828,0.118056,1442.76,1.0,11.0,1.0,0.057833,1.0,0.0,0.154341,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.140594,0.165541,0.179487,31.0,1.0,0.259259,0.15534,10.0,0.164885,0.175325,0.262097,460.79,0.0,23.0,6.0,0.172414,5.0,1.0,0.156962,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [15]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1573 entries, 0 to 1572
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   EnvironmentSatisfaction  1573 non-null   float64
 1   JobSatisfaction          1573 non-null   float64
 2   WorkLifeBalance          1573 non-null   float64
 3   Age                      1573 non-null   float64
 4   Attrition                1573 non-null   float64
 5   BusinessTravel           1573 non-null   float64
 6   Department               1573 non-null   float64
 7   DistanceFromHome         1573 non-null   float64
 8   EducationField           1573 non-null   float64
 9   JobRole                  1573 non-null   float64
 10  MaritalStatus            1573 non-null   float64
 11  MonthlyIncome            1573 non-null   float64
 12  NumCompaniesWorked       1573 non-null   float64
 13  PercentSalaryHike        1573 non-null   float64
 14  TotalWorkingYears       

En el EDA detectamos columnas con presencia de outliers univariados; para que influyan lo menos posible, vamos a usar `RobustScaler` como método de escalado de los datos.

In [16]:
# Scale every column except the target ("Attrition") with the scaling helper
# from our src package. Per the original author's note, the helper falls back
# to RobustScaler() when no other scaler is passed.
# NOTE(review): the one-hot dummy columns are scaled too (e.g. Gender_Male
# 0.0 becomes -1.0 in the result) — confirm this is intended.
df_robust, scaler = sp.escalar_datos(
    data=df,
    cols=df.columns.drop("Attrition"),
)

# Put the untouched target back as the first column, next to the scaled features.
df_robust = pd.concat(
    [df["Attrition"], df_robust],
    axis="columns",
)

In [17]:
# Preview the first rows of the scaled frame (target column first).
df_robust.head()

Unnamed: 0,Attrition,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,DistanceFromHome,EducationField,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,JobInvolvement,Gender_Female,Gender_Male,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,JobLevel_Head,JobLevel_Intern,JobLevel_Junior,JobLevel_Manager,JobLevel_Senior,StockOptionLevel_Bad,StockOptionLevel_Best,StockOptionLevel_Better,StockOptionLevel_Good
0,0.0,0.0,-0.943915,1.0,1.153846,0.0,-1.0,-0.083333,0.0,-0.945096,0.0,1.498997,-0.333333,-0.5,-1.0,-4.070248,-0.666667,-0.333333,0.0,1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.056085,1.272173,-0.384615,0.113333,0.0,0.25,0.0,0.45135,1.0,-0.129721,-0.666667,1.5,-0.444444,0.0,0.0,0.0,1.0,1.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.505837,0.056085,5.867264,-0.307692,0.113333,0.0,0.833333,-5.396957,0.054904,0.0,2.632366,-0.333333,0.166667,-0.555556,0.087346,0.0,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,-0.494163,-0.943915,0.0,0.153846,-0.070455,0.0,-0.416667,0.0,-1.595795,0.0,0.624156,0.333333,-0.5,0.333333,-1.443304,0.5,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,-0.494163,1.251217,0.0,-0.307692,0.0,0.0,0.25,0.076206,0.054904,1.0,-0.466703,0.666667,-0.333333,-0.111111,0.087346,0.166667,-0.333333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Persist the scaled frame (target + scaled features) as a pickle file.
# NOTE(review): the fitted `scaler` is not saved in this cell — confirm it is
# persisted elsewhere if it is needed to transform new data later.
df_robust.to_pickle("../../datos/modelo3/datos_scaled.pkl")