In [18]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd
import pickle

# Otros objetivos
# -----------------------------------------------------------------------
import math

# Gráficos
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

from itertools import product, combinations
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, RobustScaler
from sklearn.neighbors import LocalOutlierFactor # para detectar outliers usando el método LOF
from sklearn.ensemble import IsolationForest # para detectar outliers usando el metodo IF

# Para imputar nulos
# -------------------------------------------------------------------------
from sklearn.experimental import enable_iterative_imputer  # Habilita IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Para tratar el problema de desbalance
# -----------------------------------------------------------------------
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder #, TargetEncoder # para poder aplicar los métodos de OneHot, Ordinal,  Label y Target Encoder 

from category_encoders import TargetEncoder 

In [19]:
from scipy.stats import chi2_contingency

In [20]:

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when a DataFrame is displayed (no truncation)

In [21]:
# Extend the module search path with the directory two levels above the
# notebook's working directory, so that the `src` package imported in the next
# cell can be resolved (presumably the repository root — confirm).
import sys
sys.path.append("../../")

In [22]:
# Insertamos los soportes de nuestro src
from src import soporte_preprocesamiento as sp
from src import soporte_encoding as se
from src import soporte_logistica_preprocesamiento as slp


In [None]:
# Load the dataset produced by the previous (null-handling) stage and give it
# a clean 0..n-1 index. Pickle files execute code on load, so only read files
# generated by this project.
ruta_datos = "../../datos/modelo3/datos_sin_nulos.pkl"
df = pd.read_pickle(ruta_datos)
df = df.reset_index(drop=True)


In [24]:
# Preview the first rows to check that the data loaded as expected.
df.head()

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,JobInvolvement
0,High,Very High,Good,51.0,0.0,Travel_Rarely,Sales,6.0,2.0,Life Sciences,Female,Intern,Healthcare Representative,Married,1442.76,1.0,11.0,Bad,1.0,6.0,1.0,0.0,High
1,High,Medium,Best,31.0,1.0,Travel_Frequently,Research & Development,10.0,1.0,Life Sciences,Female,Intern,Research Scientist,Single,460.79,0.0,23.0,Good,6.0,3.0,5.0,1.0,Medium
2,Medium,Medium,Bad,32.0,0.0,Travel_Frequently,Research & Development,17.0,4.0,Other,Male,Manager,Sales Executive,Married,2126.08,1.0,15.0,Best,5.0,2.0,5.0,0.0,High
3,Very High,Very High,Better,38.0,0.0,Non-Travel,Research & Development,2.0,5.0,Life Sciences,Male,Senior,Human Resources,Married,915.31,3.0,11.0,Best,13.0,5.0,8.0,7.0,Medium
4,Very High,Low,Better,32.0,0.0,Travel_Rarely,Research & Development,10.0,1.0,Medical,Male,Intern,Sales Executive,Single,257.62,4.0,12.0,Better,9.0,2.0,6.0,0.0,High


Guardamos los nombres de las columnas categóricas del DataFrame (las de tipo `object`) en `lista_categoricas`.

In [25]:
# Keep only the object-dtype (categorical) columns and remember their names.
# Note that `lista_categoricas` is a pandas Index, not a plain Python list.
df_cat = df.select_dtypes(include="O")
lista_categoricas = df_cat.columns

In [26]:
# Display the names of the detected categorical columns.
lista_categoricas


Index(['EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance',
       'BusinessTravel', 'Department', 'Education', 'EducationField', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'StockOptionLevel',
       'TrainingTimesLastYear', 'JobInvolvement'],
      dtype='object')

In [27]:
# For every categorical column, the project helper prints its contingency table
# against "Attrition" and a verdict on whether the variable "has order"
# (see the output below). The criterion itself lives in
# src/soporte_preprocesamiento and is not visible here.
sp.detectar_orden_var_cat(df,lista_categoricas,"Attrition")

Evaluando la variable ENVIRONMENTSATISFACTION


Attrition,0.0,1.0
EnvironmentSatisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1
High,434,71
Low,225,74
Medium,259,45
Very High,403,62


La variable EnvironmentSatisfaction tiene orden.
_________________________ 

Evaluando la variable JOBSATISFACTION


Attrition,0.0,1.0
JobSatisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1
High,407,79
Low,232,69
Medium,247,49
Very High,435,55


La variable JobSatisfaction tiene orden.
_________________________ 

Evaluando la variable WORKLIFEBALANCE


Attrition,0.0,1.0
WorkLifeBalance,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,56,27
Best,128,28
Better,841,136
Good,296,61


La variable WorkLifeBalance tiene orden.
_________________________ 

Evaluando la variable BUSINESSTRAVEL


Attrition,0.0,1.0
BusinessTravel,Unnamed: 1_level_1,Unnamed: 2_level_1
Non-Travel,147,12
Travel_Frequently,220,77
Travel_Rarely,954,163


La variable BusinessTravel tiene orden.
_________________________ 

Evaluando la variable DEPARTMENT


Attrition,0.0,1.0
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,44,22
Research & Development,870,160
Sales,407,70


La variable Department tiene orden.
_________________________ 

Evaluando la variable EDUCATION


Attrition,0.0,1.0
Education,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,154,27
2.0,246,57
3.0,519,96
4.0,358,64
5.0,44,8


La variable Education NO tiene orden.
_________________________ 

Evaluando la variable EDUCATIONFIELD


Attrition,0.0,1.0
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,16,12
Life Sciences,547,108
Marketing,141,26
Medical,408,81
Other,80,10
Technical Degree,129,15


La variable EducationField tiene orden.
_________________________ 

Evaluando la variable GENDER


Attrition,0.0,1.0
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,525,95
Male,796,157


La variable Gender NO tiene orden.
_________________________ 

Evaluando la variable JOBLEVEL


Attrition,0.0,1.0
JobLevel,Unnamed: 1_level_1,Unnamed: 2_level_1
Head,63,12
Intern,500,87
Junior,467,104
Manager,94,17
Senior,197,32


La variable JobLevel NO tiene orden.
_________________________ 

Evaluando la variable JOBROLE


Attrition,0.0,1.0
JobRole,Unnamed: 1_level_1,Unnamed: 2_level_1
Healthcare Representative,124,21
Human Resources,47,7
Laboratory Technician,232,46
Manager,91,16
Manufacturing Director,141,16
Research Director,63,22
Research Scientist,254,54
Sales Executive,290,58
Sales Representative,79,12


La variable JobRole NO tiene orden.
_________________________ 

Evaluando la variable MARITALSTATUS


Attrition,0.0,1.0
MaritalStatus,Unnamed: 1_level_1,Unnamed: 2_level_1
Divorced,320,37
Married,635,85
Single,366,130


La variable MaritalStatus tiene orden.
_________________________ 

Evaluando la variable STOCKOPTIONLEVEL


Attrition,0.0,1.0
StockOptionLevel,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,564,112
Best,75,15
Better,138,28
Good,544,97


La variable StockOptionLevel NO tiene orden.
_________________________ 

Evaluando la variable TRAININGTIMESLASTYEAR


Attrition,0.0,1.0
TrainingTimesLastYear,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,45,10
1.0,64,11
2.0,486,103
3.0,432,90
4.0,116,17
5.0,112,17
6.0,66,4


La variable TrainingTimesLastYear NO tiene orden.
_________________________ 

Evaluando la variable JOBINVOLVEMENT


Attrition,0.0,1.0
JobInvolvement,Unnamed: 1_level_1,Unnamed: 2_level_1
High,789,144
Low,71,19
Medium,333,62
Very High,128,27


La variable JobInvolvement NO tiene orden.
_________________________ 



In [28]:
# Re-display the categorical column names as a reference for assigning an
# encoding method to each one in the following cell.
lista_categoricas

Index(['EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance',
       'BusinessTravel', 'Department', 'Education', 'EducationField', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'StockOptionLevel',
       'TrainingTimesLastYear', 'JobInvolvement'],
      dtype='object')

Definimos un diccionario con los métodos de encoding y las columnas que asignaremos a cada uno, en función de si tienen orden o no.

In [29]:
# Columns that will be one-hot encoded.
columnas_onehot = ["Gender", "Education", "JobLevel", "StockOptionLevel"]

# Columns that will be target-encoded.
# NOTE(review): the order-detection output above reports JobRole,
# TrainingTimesLastYear and JobInvolvement as "NO tiene orden", yet they are
# target-encoded here rather than one-hot encoded — confirm this is intended
# (e.g. to limit the number of generated columns).
columnas_target_enc = [
    "BusinessTravel",
    "Department",
    "JobRole",
    "MaritalStatus",
    "TrainingTimesLastYear",
    "EnvironmentSatisfaction",
    "JobSatisfaction",
    "WorkLifeBalance",
    "JobInvolvement",
    "EducationField",
]

# Mapping of encoding method -> columns it applies to; methods that are not
# used in this notebook are left empty.
diccionario_encoding = {
    "onehot": columnas_onehot,
    "dummies": [],
    "ordinal": {},
    "label": [],
    "frequency": [],
    "target": columnas_target_enc,
}

In [30]:
# Build the project's encoding helper from the DataFrame and the
# method -> columns mapping defined above.
# NOTE(review): "Attrition" is presumably the name of the target column —
# confirm against src/soporte_encoding.
encoding = se.Encoding(df, diccionario_encoding, "Attrition")

In [31]:
# Apply the helper's one-hot step. In the preview below, Gender, Education,
# JobLevel and StockOptionLevel have been replaced by one indicator column
# per category; the remaining categorical columns are still unencoded.
df_one_hot = encoding.one_hot_encoding()
df_one_hot.head()

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,EducationField,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,JobInvolvement,Gender_Female,Gender_Male,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,JobLevel_Head,JobLevel_Intern,JobLevel_Junior,JobLevel_Manager,JobLevel_Senior,StockOptionLevel_Bad,StockOptionLevel_Best,StockOptionLevel_Better,StockOptionLevel_Good
0,High,Very High,Good,51.0,0.0,Travel_Rarely,Sales,6.0,Life Sciences,Healthcare Representative,Married,1442.76,1.0,11.0,1.0,6.0,1.0,0.0,High,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,High,Medium,Best,31.0,1.0,Travel_Frequently,Research & Development,10.0,Life Sciences,Research Scientist,Single,460.79,0.0,23.0,6.0,3.0,5.0,1.0,Medium,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Medium,Medium,Bad,32.0,0.0,Travel_Frequently,Research & Development,17.0,Other,Sales Executive,Married,2126.08,1.0,15.0,5.0,2.0,5.0,0.0,High,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,Very High,Very High,Better,38.0,0.0,Non-Travel,Research & Development,2.0,Life Sciences,Human Resources,Married,915.31,3.0,11.0,13.0,5.0,8.0,7.0,Medium,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,Very High,Low,Better,32.0,0.0,Travel_Rarely,Research & Development,10.0,Medical,Sales Executive,Single,257.62,4.0,12.0,9.0,2.0,6.0,0.0,High,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [32]:
# Target-encode the columns registered under the "target" key of
# diccionario_encoding. Reading the list from the dictionary (instead of
# repeating it by hand) keeps a single source of truth, so this cell cannot
# silently drift from the encoding configuration defined above.
cols_target = list(diccionario_encoding["target"])
target_encoder = TargetEncoder(cols=cols_target)

# Only the columns listed in `cols` are transformed; every other column of
# df_one_hot (including "Attrition" itself) is passed through unchanged.
# NOTE(review): the encoder is fit on every row of df_one_hot. If a train/test
# split is performed later, the encoded values will already contain
# information from the test-set targets — confirm this is acceptable.
df_encoded = target_encoder.fit_transform(df_one_hot, df_one_hot["Attrition"])
df_encoded.head()

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,EducationField,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,JobInvolvement,Gender_Female,Gender_Male,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,JobLevel_Head,JobLevel_Intern,JobLevel_Junior,JobLevel_Manager,JobLevel_Senior,StockOptionLevel_Bad,StockOptionLevel_Best,StockOptionLevel_Better,StockOptionLevel_Good
0,0.140594,0.112245,0.170868,51.0,0.0,0.145927,0.146751,6.0,0.164885,0.144828,0.118056,1442.76,1.0,11.0,1.0,0.057833,1.0,0.0,0.154341,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.140594,0.165541,0.179487,31.0,1.0,0.259259,0.15534,10.0,0.164885,0.175325,0.262097,460.79,0.0,23.0,6.0,0.172414,5.0,1.0,0.156962,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.148026,0.165541,0.324999,32.0,0.0,0.259259,0.15534,17.0,0.111156,0.166667,0.118056,2126.08,1.0,15.0,5.0,0.174873,5.0,0.0,0.154341,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.133333,0.112245,0.139202,38.0,0.0,0.075472,0.15534,2.0,0.164885,0.130617,0.118056,915.31,3.0,11.0,13.0,0.131783,8.0,7.0,0.156962,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.133333,0.229236,0.139202,32.0,0.0,0.145927,0.15534,10.0,0.165644,0.166667,0.262097,257.62,4.0,12.0,9.0,0.174873,6.0,0.0,0.154341,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
# List the columns of the fully encoded DataFrame as a final sanity check.
df_encoded.columns

Index(['EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'Age',
       'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'EducationField', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'PercentSalaryHike', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'JobInvolvement', 'Gender_Female', 'Gender_Male', 'Education_1.0',
       'Education_2.0', 'Education_3.0', 'Education_4.0', 'Education_5.0',
       'JobLevel_Head', 'JobLevel_Intern', 'JobLevel_Junior',
       'JobLevel_Manager', 'JobLevel_Senior', 'StockOptionLevel_Bad',
       'StockOptionLevel_Best', 'StockOptionLevel_Better',
       'StockOptionLevel_Good'],
      dtype='object')

In [None]:
# Persist the encoded DataFrame so later stages of the pipeline can load it.
ruta_salida = '../../datos/modelo3/datos_encoded.pkl'
with open(ruta_salida, mode='wb') as archivo_salida:
    pickle.dump(df_encoded, archivo_salida)