In [119]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd
import pickle

# Otros objetivos
# -----------------------------------------------------------------------
import math

# Gráficos
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

from itertools import product, combinations
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, RobustScaler
from sklearn.neighbors import LocalOutlierFactor # para detectar outliers usando el método LOF
from sklearn.ensemble import IsolationForest # para detectar outliers usando el metodo IF

# Para imputar nulos
# -------------------------------------------------------------------------
from sklearn.experimental import enable_iterative_imputer  # Habilita IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Para tratar el problema de desbalance
# -----------------------------------------------------------------------
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder #, TargetEncoder # para poder aplicar los métodos de OneHot, Ordinal,  Label y Target Encoder 

from category_encoders import TargetEncoder 

In [120]:
from scipy.stats import chi2_contingency

In [121]:

# Display settings
# -----------------------------------------------------------------------
pd.options.display.max_columns = None  # no column limit: show every DataFrame column

In [122]:
import sys
import os

# Make the parent of the working directory importable so `from src import ...` resolves.
# The path is resolved against the kernel's current working directory.
# The membership check keeps the cell idempotent: re-running it no longer appends
# a duplicate entry to sys.path each time.
ruta_proyecto = os.path.abspath('../')
if ruta_proyecto not in sys.path:
    sys.path.append(ruta_proyecto)

In [123]:
# Insertamos los soportes de nuestro src
from src import soporte_preprocesamiento as sp
from src import soporte_encoding as se
from src import soporte_logistica_preprocesamiento as slp


In [124]:
# Load the pickled input frame (by its file name, presumably the null-free output of an
# earlier step — confirm) and replace its index with a fresh 0..n-1 range.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
df = (
    pd.read_pickle("../datos/datos_sin_nulos.pkl")
    .reset_index(drop=True)
)


In [125]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,JobInvolvement
0,High,Very High,Good,51.0,0.0,Travel_Rarely,Sales,6.0,2.0,Life Sciences,Female,Intern,Healthcare Representative,Married,1442.76,1.0,11.0,Bad,1.0,6.0,1.0,0.0,High
1,High,Medium,Best,31.0,1.0,Travel_Frequently,Research & Development,10.0,1.0,Life Sciences,Female,Intern,Research Scientist,Single,460.79,0.0,23.0,Good,6.0,3.0,5.0,1.0,Medium
2,Medium,Medium,Bad,32.0,0.0,Travel_Frequently,Research & Development,17.0,4.0,Other,Male,Manager,Sales Executive,Married,2126.08,1.0,15.0,Best,5.0,2.0,5.0,0.0,High
3,Very High,Very High,Better,38.0,0.0,Non-Travel,Research & Development,2.0,5.0,Life Sciences,Male,Senior,Human Resources,Married,915.31,3.0,11.0,Best,13.0,5.0,8.0,7.0,Medium
4,Very High,Low,Better,32.0,0.0,Travel_Rarely,Research & Development,10.0,1.0,Medical,Male,Intern,Sales Executive,Single,257.62,4.0,12.0,Better,9.0,2.0,6.0,0.0,High


Guardamos los nombres de las columnas categóricas (dtype `object`) del DataFrame.

In [126]:
# Keep only the object-dtype columns and remember their names (a pandas Index).
df_cat = df.select_dtypes(include="object")
lista_categoricas = df_cat.columns

In [127]:
# Display the names of the object-dtype columns.
lista_categoricas


Index(['EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance',
       'BusinessTravel', 'Department', 'Education', 'EducationField', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'StockOptionLevel',
       'TrainingTimesLastYear', 'JobInvolvement'],
      dtype='object')

In [128]:
# Run the project helper over every categorical column against the "Attrition" column.
# Per the printed output, it shows a category-by-Attrition count table for each column and
# reports whether the column "tiene orden". The criterion lives in
# src/soporte_preprocesamiento (presumably a chi-square test on the table — TODO confirm).
sp.detectar_orden_var_cat(df,lista_categoricas,"Attrition")

Evaluando la variable ENVIRONMENTSATISFACTION


Attrition,0.0,1.0
EnvironmentSatisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1
High,1164,186
Low,632,213
Medium,728,128
Very High,1155,179


La variable EnvironmentSatisfaction tiene orden.
_________________________ 

Evaluando la variable JOBSATISFACTION


Attrition,0.0,1.0
JobSatisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1
High,1104,219
Low,663,197
Medium,702,138
Very High,1211,156


La variable JobSatisfaction tiene orden.
_________________________ 

Evaluando la variable WORKLIFEBALANCE


Attrition,0.0,1.0
WorkLifeBalance,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,164,75
Best,373,81
Better,2280,380
Good,848,171


La variable WorkLifeBalance tiene orden.
_________________________ 

Evaluando la variable BUSINESSTRAVEL


Attrition,0.0,1.0
BusinessTravel,Unnamed: 1_level_1,Unnamed: 2_level_1
Non-Travel,414,36
Travel_Frequently,624,207
Travel_Rarely,2661,468


La variable BusinessTravel tiene orden.
_________________________ 

Evaluando la variable DEPARTMENT


Attrition,0.0,1.0
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,132,57
Research & Development,2430,453
Sales,1137,201


La variable Department tiene orden.
_________________________ 

Evaluando la variable EDUCATION


Attrition,0.0,1.0
Education,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,432,78
2.0,687,159
3.0,1449,267
4.0,1008,186
5.0,123,21


La variable Education NO tiene orden.
_________________________ 

Evaluando la variable EDUCATIONFIELD


Attrition,0.0,1.0
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,48,33
Life Sciences,1515,303
Marketing,402,75
Medical,1167,225
Other,216,30
Technical Degree,351,45


La variable EducationField tiene orden.
_________________________ 

Evaluando la variable GENDER


Attrition,0.0,1.0
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,1494,270
Male,2205,441


La variable Gender NO tiene orden.
_________________________ 

Evaluando la variable JOBLEVEL


Attrition,0.0,1.0
JobLevel,Unnamed: 1_level_1,Unnamed: 2_level_1
Head,180,27
Intern,1377,252
Junior,1317,285
Manager,267,51
Senior,558,96


La variable JobLevel NO tiene orden.
_________________________ 

Evaluando la variable JOBROLE


Attrition,0.0,1.0
JobRole,Unnamed: 1_level_1,Unnamed: 2_level_1
Healthcare Representative,336,57
Human Resources,135,21
Laboratory Technician,651,126
Manager,264,42
Manufacturing Director,387,48
Research Director,183,57
Research Scientist,717,159
Sales Executive,813,165
Sales Representative,213,36


La variable JobRole tiene orden.
_________________________ 

Evaluando la variable MARITALSTATUS


Attrition,0.0,1.0
MaritalStatus,Unnamed: 1_level_1,Unnamed: 2_level_1
Divorced,882,99
Married,1767,252
Single,1050,360


La variable MaritalStatus tiene orden.
_________________________ 

Evaluando la variable STOCKOPTIONLEVEL


Attrition,0.0,1.0
StockOptionLevel,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,1575,318
Best,216,39
Better,390,84
Good,1518,270


La variable StockOptionLevel NO tiene orden.
_________________________ 

Evaluando la variable TRAININGTIMESLASTYEAR


Attrition,0.0,1.0
TrainingTimesLastYear,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,132,30
1.0,183,30
2.0,1359,282
3.0,1215,258
4.0,321,48
5.0,306,51
6.0,183,12


La variable TrainingTimesLastYear tiene orden.
_________________________ 

Evaluando la variable JOBINVOLVEMENT


Attrition,0.0,1.0
JobInvolvement,Unnamed: 1_level_1,Unnamed: 2_level_1
High,2205,399
Low,195,54
Medium,945,180
Very High,354,78


La variable JobInvolvement tiene orden.
_________________________ 



In [129]:
# Show the categorical column names again as a reference for building the encoding map.
lista_categoricas

Index(['EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance',
       'BusinessTravel', 'Department', 'Education', 'EducationField', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'StockOptionLevel',
       'TrainingTimesLastYear', 'JobInvolvement'],
      dtype='object')

Definimos un diccionario con los métodos de encoding y las columnas que asignaremos a cada uno en función de si tienen orden o no.

In [130]:
# Encoding method -> columns handled by that method.
diccionario_encoding = {
    # Columns the order check reported as NOT ordered.
    "onehot": [
        "Gender",
        "Education",
        "JobLevel",
        "StockOptionLevel",
    ],
    # Methods not used in this notebook are left empty.
    "dummies": [],
    "ordinal": {},
    "label": [],
    "frequency": [],
    # Columns the order check reported as ordered.
    "target": [
        "BusinessTravel",
        "Department",
        "JobRole",
        "MaritalStatus",
        "TrainingTimesLastYear",
        "EnvironmentSatisfaction",
        "JobSatisfaction",
        "WorkLifeBalance",
        "JobInvolvement",
        "EducationField",
    ],
}

In [131]:
# Instantiate the project's Encoding helper with the frame, the method-to-columns mapping
# and "Attrition" (presumably the response variable — see src/soporte_encoding to confirm).
encoding = se.Encoding(df, diccionario_encoding, "Attrition")

In [132]:
# Apply one-hot encoding through the project helper. Judging by the output, each column
# listed under "onehot" is replaced by one indicator column per category.
df_one_hot = encoding.one_hot_encoding()
df_one_hot.head()

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,EducationField,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,JobInvolvement,Gender_Female,Gender_Male,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,JobLevel_Head,JobLevel_Intern,JobLevel_Junior,JobLevel_Manager,JobLevel_Senior,StockOptionLevel_Bad,StockOptionLevel_Best,StockOptionLevel_Better,StockOptionLevel_Good
0,High,Very High,Good,51.0,0.0,Travel_Rarely,Sales,6.0,Life Sciences,Healthcare Representative,Married,1442.76,1.0,11.0,1.0,6.0,1.0,0.0,High,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,High,Medium,Best,31.0,1.0,Travel_Frequently,Research & Development,10.0,Life Sciences,Research Scientist,Single,460.79,0.0,23.0,6.0,3.0,5.0,1.0,Medium,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Medium,Medium,Bad,32.0,0.0,Travel_Frequently,Research & Development,17.0,Other,Sales Executive,Married,2126.08,1.0,15.0,5.0,2.0,5.0,0.0,High,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,Very High,Very High,Better,38.0,0.0,Non-Travel,Research & Development,2.0,Life Sciences,Human Resources,Married,915.31,3.0,11.0,13.0,5.0,8.0,7.0,Medium,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,Very High,Low,Better,32.0,0.0,Travel_Rarely,Research & Development,10.0,Medical,Sales Executive,Single,257.62,4.0,12.0,9.0,2.0,6.0,0.0,High,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [133]:
# Target-encode the columns assigned to the "target" method. The list is read from
# diccionario_encoding so the column assignment is declared in one place and cannot drift
# from a second hand-typed copy.
# "Attrition" is not among the encoded columns, so it stays in the resulting frame as-is.
# NOTE(review): the encoder is fitted on the whole dataset. If a train/test split is done
# later, fit it on the training rows only to avoid leaking the target into the features.
target_encoder = TargetEncoder(cols=list(diccionario_encoding["target"]))
df_encoded = target_encoder.fit_transform(df_one_hot, df_one_hot["Attrition"])
df_encoded.head()

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,EducationField,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,JobInvolvement,Gender_Female,Gender_Male,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,JobLevel_Head,JobLevel_Intern,JobLevel_Junior,JobLevel_Manager,JobLevel_Senior,StockOptionLevel_Bad,StockOptionLevel_Best,StockOptionLevel_Better,StockOptionLevel_Good
0,0.137778,0.114119,0.167812,51.0,0.0,0.149569,0.150224,6.0,0.166667,0.145038,0.124814,1442.76,1.0,11.0,1.0,0.061538,1.0,0.0,0.153226,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.137778,0.164286,0.178414,31.0,1.0,0.249097,0.157128,10.0,0.166667,0.181507,0.255319,460.79,0.0,23.0,6.0,0.175153,5.0,1.0,0.16,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.149533,0.164286,0.313808,32.0,0.0,0.249097,0.157128,17.0,0.121951,0.168712,0.124814,2126.08,1.0,15.0,5.0,0.171846,5.0,0.0,0.153226,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.134183,0.114119,0.142857,38.0,0.0,0.08,0.157128,2.0,0.166667,0.134615,0.124814,915.31,3.0,11.0,13.0,0.142857,8.0,7.0,0.16,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.134183,0.22907,0.142857,32.0,0.0,0.149569,0.157128,10.0,0.161638,0.168712,0.255319,257.62,4.0,12.0,9.0,0.171846,6.0,0.0,0.153226,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [134]:
# List the final column names to check the layout of the encoded frame.
df_encoded.columns

Index(['EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'Age',
       'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'EducationField', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'PercentSalaryHike', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'JobInvolvement', 'Gender_Female', 'Gender_Male', 'Education_1.0',
       'Education_2.0', 'Education_3.0', 'Education_4.0', 'Education_5.0',
       'JobLevel_Head', 'JobLevel_Intern', 'JobLevel_Junior',
       'JobLevel_Manager', 'JobLevel_Senior', 'StockOptionLevel_Bad',
       'StockOptionLevel_Best', 'StockOptionLevel_Better',
       'StockOptionLevel_Good'],
      dtype='object')

In [135]:
# Persist the encoded frame to disk. DataFrame.to_pickle is the counterpart of the
# pd.read_pickle call used to load the input, and it opens and closes the file itself.
df_encoded.to_pickle('../datos/datos_encoded.pkl')