In [28]:
# Data processing  
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
import pickle

# Pandas options  
# -----------------------------------------------------------------------
pd.options.display.max_colwidth = None

# Path configuration for custom module imports
# -----------------------------------------------------------------------
import sys
sys.path.append('../')  # Adds the parent directory to the path for custom module imports

# Ignore warnings  
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

# Custom functions and classes
# -----------------------------------------------------------------------
from src.support_encoding import Encoding, chi2_test
from src.support_scaling import scale_df
from src.support_eda import value_counts

## Data loading

In [14]:
df = pd.read_csv('../data/output/complete_data_imputed.csv', index_col=0).reset_index(drop=True)

In [15]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,...,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,...,11,0,1.0,6,1,0,3.0,4.0,2.0,3
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,...,23,1,6.0,3,5,1,3.0,2.0,4.0,2
2,32,No,Travel_Frequently,Research & Development,17,4,Other,Male,4,Sales Executive,...,15,3,5.0,2,5,0,2.0,2.0,1.0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,Male,3,Human Resources,...,11,3,13.0,5,8,7,4.0,4.0,3.0,2
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,Male,1,Sales Executive,...,12,2,9.0,2,6,0,4.0,1.0,3.0,3


## Chi2 test

In [16]:
catgs = df.select_dtypes(include=['O', 'category']).columns

In [17]:
chi2_test(df,  catgs, 'Attrition', show=True)

We are evaluating the variable ATTRITION


Attrition,No,Yes
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
No,3699,0
Yes,0,711


For the category ATTRITION there are significant differences, p = 0.0000


Attrition,No,Yes
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
No,3103.0,596.0
Yes,596.0,115.0


--------------------------
We are evaluating the variable BUSINESSTRAVEL


Attrition,No,Yes
BusinessTravel,Unnamed: 1_level_1,Unnamed: 2_level_1
Non-Travel,414,36
Travel_Frequently,624,207
Travel_Rarely,2661,468


For the category BUSINESSTRAVEL there are significant differences, p = 0.0000


Attrition,No,Yes
BusinessTravel,Unnamed: 1_level_1,Unnamed: 2_level_1
Non-Travel,377.0,73.0
Travel_Frequently,697.0,134.0
Travel_Rarely,2625.0,504.0


--------------------------
We are evaluating the variable DEPARTMENT


Attrition,No,Yes
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,132,57
Research & Development,2430,453
Sales,1137,201


For the category DEPARTMENT there are significant differences, p = 0.0000


Attrition,No,Yes
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,159.0,30.0
Research & Development,2418.0,465.0
Sales,1122.0,216.0


--------------------------
We are evaluating the variable EDUCATIONFIELD


Attrition,No,Yes
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,48,33
Life Sciences,1515,303
Marketing,402,75
Medical,1167,225
Other,216,30
Technical Degree,351,45


For the category EDUCATIONFIELD there are significant differences, p = 0.0000


Attrition,No,Yes
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,68.0,13.0
Life Sciences,1525.0,293.0
Marketing,400.0,77.0
Medical,1168.0,224.0
Other,206.0,40.0
Technical Degree,332.0,64.0


--------------------------
We are evaluating the variable GENDER


Attrition,No,Yes
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,1494,270
Male,2205,441


For the category GENDER there are NO significant differences, p = 0.2453

--------------------------
We are evaluating the variable JOBROLE


Attrition,No,Yes
JobRole,Unnamed: 1_level_1,Unnamed: 2_level_1
Healthcare Representative,336,57
Human Resources,135,21
Laboratory Technician,651,126
Manager,264,42
Manufacturing Director,387,48
Research Director,183,57
Research Scientist,717,159
Sales Executive,813,165
Sales Representative,213,36


For the category JOBROLE there are significant differences, p = 0.0015


Attrition,No,Yes
JobRole,Unnamed: 1_level_1,Unnamed: 2_level_1
Healthcare Representative,330.0,63.0
Human Resources,131.0,25.0
Laboratory Technician,652.0,125.0
Manager,257.0,49.0
Manufacturing Director,365.0,70.0
Research Director,201.0,39.0
Research Scientist,735.0,141.0
Sales Executive,820.0,158.0
Sales Representative,209.0,40.0


--------------------------
We are evaluating the variable MARITALSTATUS


Attrition,No,Yes
MaritalStatus,Unnamed: 1_level_1,Unnamed: 2_level_1
Divorced,882,99
Married,1767,252
Single,1050,360


For the category MARITALSTATUS there are significant differences, p = 0.0000


Attrition,No,Yes
MaritalStatus,Unnamed: 1_level_1,Unnamed: 2_level_1
Divorced,823.0,158.0
Married,1693.0,326.0
Single,1183.0,227.0


--------------------------


## Encoding

Lo primero que vamos a hacer es convertir la target varible, `Attrition`, a numérica de la siguiente forma:

* `No`: `0`

* `Yes`: `1`

In [18]:
df['Attrition'].replace({'Yes': 1, 'No': 0}, inplace=True)

In [19]:
df.select_dtypes(include=['O', 'category']).head()

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus
0,Travel_Rarely,Sales,Life Sciences,Female,Healthcare Representative,Married
1,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Single
2,Travel_Frequently,Research & Development,Other,Male,Sales Executive,Married
3,Non-Travel,Research & Development,Life Sciences,Male,Human Resources,Married
4,Travel_Rarely,Research & Development,Medical,Male,Sales Executive,Single


Hay diferencias significativas en todas las variables salvo en `Gender`, por lo que usaremos `TargetEncoding` en todas salvo en género, que utilizaremos `OneHot`

In [20]:
encoding_methods = {"onehot": ['Gender'],
                    "target": ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus'],
                    "ordinal" : {},
                    "frequency": []
                    }

encoder = Encoding(df, encoding_methods, 'Attrition')

In [21]:
df_encoded = encoder.execute_all_encodings()

### Scaling

Como hemos utilizado `OneHotEncoder` y `TargetEncoder` hace target a la variable binaria `Attrition` vale la pena comenzar probando a utilizar un `MinMaxScaler` ya que tampoco ha parecido haber demasiados outliers.

In [25]:
df_scaled, scaler = scale_df(df_encoded, df_encoded.columns.to_list(), method="minmax")

### Imbalanced data

In [29]:
value_counts(df, 'Attrition')

The number of unique values for this category is 2


Unnamed: 0_level_0,count,proportion
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3699,0.84
1,711,0.16


Nuestro conjunto de datos está bastante desbalanceado, por lo que en futuros modelos podríamos considerar un rebalanceo mediante remuestreo

### Save data

In [27]:
df_scaled.to_csv('../data/output/complete_data_preprocessed.csv')