# Preprocessing data

Categorical columns will be encoded using One-Hot encoding.

Since the distributions of the numerical columns are not normal, we will use scikit-learn's `PowerTransformer` to make them more normal-like.

## Preparing environment

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Importing data

In [2]:
# Load the interim train/test tables; the file locations are resolved by the project's `paths` helper
train_df, test_df = (
    pd.read_csv(paths.data_interim_dir(file_name))
    for file_name in ('train_node.csv', 'test_node.csv')
)

In [3]:
# Identifier columns
id_cols = [
    'id_employee_employee',
    'id_last_boss_employee',
    'id_last_boss_boss',
]

# Numerical features, laid out by suffix: *_employee, *_boss, then *_diff
num_cols = [
    # *_employee
    'office_distance_employee', 'low_health_days_employee', 'average_permanence_employee',
    'salary_employee', 'performance_score_employee', 'psi_score_employee', 'join_age_employee',
    # *_boss
    'office_distance_boss', 'low_health_days_boss', 'average_permanence_boss',
    'salary_boss', 'performance_score_boss', 'psi_score_boss', 'join_age_boss',
    # *_diff
    'office_distance_diff', 'low_health_days_diff', 'average_permanence_diff',
    'salary_diff', 'join_days_diff', 'age_diff',
]

# Categorical features, laid out the same way
cat_cols = [
    # *_employee
    'seniority_employee', 'work_modality_employee', 'gender_employee',
    'recruitment_channel_employee', 'marital_estatus_employee', 'join_year_employee',
    'join_month_employee', 'performance_employee',
    # *_boss
    'work_modality_boss', 'gender_boss', 'recruitment_channel_boss', 'marital_estatus_boss',
    'join_year_boss', 'join_month_boss', 'performance_boss',
    # employee-vs-boss comparison columns
    'joined_after_boss', 'younger_than_boss',
]

## Dropping categories according to hypothesis testing

In [4]:
# Columns discarded according to the hypothesis tests. The list is written once and applied
# to both splits so train and test cannot drift apart.
rejected_cat_cols = [
    'gender_employee', 'join_month_employee', 'work_modality_boss', 'gender_boss',
    'recruitment_channel_boss', 'marital_estatus_boss', 'join_year_boss', 'join_month_boss',
    'performance_boss', 'joined_after_boss', 'younger_than_boss',
]
rejected_num_cols = [
    'office_distance_employee', 'average_permanence_employee', 'salary_employee', 'psi_score_employee',
    'join_age_employee', 'office_distance_boss', 'low_health_days_boss', 'average_permanence_boss',
    'salary_boss', 'performance_score_boss', 'psi_score_boss', 'join_age_boss',
    'office_distance_diff', 'average_permanence_diff', 'salary_diff', 'age_diff',
]

reduced_train_df, reduced_test_df = (
    split_df.drop(columns=rejected_cat_cols + rejected_num_cols)
    for split_df in (train_df, test_df)
)

In [8]:
# Preview every column of the reduced training frame except the last 193.
# NOTE(review): the trailing 193 columns are never displayed in this notebook — presumably
# node-derived features plus the target; confirm and replace the magic number with a named constant.
reduced_train_df.iloc[:, :-193]

Unnamed: 0,id_employee_employee,id_last_boss_employee,seniority_employee,work_modality_employee,low_health_days_employee,recruitment_channel_employee,performance_score_employee,marital_estatus_employee,join_year_employee,performance_employee,id_last_boss_boss,low_health_days_diff,join_days_diff
0,100247,102074,1,Híbrida,1,Ferias & Networking,99.0,Soltero,2018,high,102028,-1,-757
1,103355,102115,1,Híbrida,2,Ferias & Networking,99.0,Soltero,2021,high,102116,0,-940
2,100669,102060,1,Híbrida,3,Referidos,96.0,Viudo,2016,high,102041,0,-990
3,103760,102062,1,Híbrida,2,Linkedin,96.0,Soltero,2014,high,102149,0,72
4,100965,102062,1,Híbrida,2,Linkedin,95.0,Soltero,2014,high,102149,0,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2147,103567,102171,1,Presencial,8,Portal Web,80.0,Casado,2023,high,102088,-4,-2270
2148,104098,102172,1,Presencial,12,Linkedin,80.0,Casado,2023,high,102020,-11,-2650
2149,103987,102155,1,Presencial,1,Portal Web,80.0,Soltero,2023,high,102159,0,-2457
2150,103810,102141,1,Presencial,2,Portal Web,80.0,Casado,2023,high,102134,-1,-1923


In [42]:
# Numerical columns that remain after the hypothesis-testing drop
reduced_num_cols = [
    'low_health_days_employee',
    'performance_score_employee',
    'low_health_days_diff',
    'join_days_diff',
]

# Categorical columns that remain after the hypothesis-testing drop
reduced_cat_cols = [
    'seniority_employee',
    'work_modality_employee',
    'recruitment_channel_employee',
    'marital_estatus_employee',
    'join_year_employee',
    'performance_employee',
]

## Preprocessing full data

In [10]:
# Transformers for the full feature set.
# drop='first' leaves out the first category of every column; sparse_output=False returns a dense array.
pt_full = PowerTransformer()
ohe_full = OneHotEncoder(drop='first', sparse_output=False)

In [11]:
# One-hot encode the categorical columns.
# The encoder is fitted on the training split only and then reused, unchanged, on the test split.
encoded_full_train = pd.DataFrame(
    ohe_full.fit_transform(train_df[cat_cols]),
    columns=ohe_full.get_feature_names_out(cat_cols),
)

encoded_full_test = pd.DataFrame(
    ohe_full.transform(test_df[cat_cols]),
    columns=ohe_full.get_feature_names_out(cat_cols),
)

In [12]:
# Power-transform the numerical columns.
# The transformer is fitted on the training split and the fitted parameters are applied to test.
scaled_full_train = pd.DataFrame(
    data=pt_full.fit_transform(train_df[num_cols]),
    columns=num_cols,
)

scaled_full_test = pd.DataFrame(
    data=pt_full.transform(test_df[num_cols]),
    columns=num_cols,
)

In [28]:
# Place the encoded categoricals and the transformed numericals side by side (column-wise)
concat_full_train = pd.concat((encoded_full_train, scaled_full_train), axis='columns')
concat_full_test = pd.concat((encoded_full_test, scaled_full_test), axis='columns')

### Feature engineering

Autofeat is a library that applies multiple numerical transformations to the features and combines them in order to find meaningful relationships between the features and the target, with the goal of improving the prediction.

In [15]:
from autofeat import AutoFeatClassifier

In [16]:
# Target vector for the autofeat fits below: the 'resign' column of the training frame
y = train_df['resign']

In [17]:
# Fit autofeat on the preprocessed training features and keep the transformed training frame.
# In the run recorded below, the parallel feature-selection step took roughly 68 minutes.
autofeat = AutoFeatClassifier(verbose=1, n_jobs=-1)
X_full_transformed = autofeat.fit_transform(concat_full_train, y)

2024-07-26 09:01:54,560 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 185745 features.
2024-07-26 09:01:54,560 INFO: [AutoFeat] With 2152 data points this new feature matrix would use about 1.60 gb of space.
2024-07-26 09:01:54,565 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             87 features transformed

2024-07-26 09:02:00,724 INFO: [feateng] Generated 100 transformed features from 87 original features - done.
2024-07-26 09:02:00,767 INFO: [feateng] Step 2: first combination of features


[feateng]           17000/          17391 feature tuples combined

2024-07-26 09:02:08,715 INFO: [feateng] Generated 16969 feature combinations from 17391 original feature tuples - done.


[feateng]           17300/          17391 feature tuples combined

2024-07-26 09:02:08,904 INFO: [feateng] Generated altogether 17412 new features in 2 steps
2024-07-26 09:02:08,905 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2024-07-26 09:02:10,009 INFO: [feateng] Generated a total of 16677 additional features


[featsel] Scaling data...done.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 58.9min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 59.0min remaining: 88.5min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 59.1min remaining: 39.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 68.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 68.4min finished


2024-07-26 10:10:39,450 INFO: [featsel] 21 features after 5 feature selection runs
2024-07-26 10:10:39,779 INFO: [featsel] 21 features after correlation filtering
2024-07-26 10:10:40,774 INFO: [featsel] 21 features after noise filtering
2024-07-26 10:10:40,786 INFO: [AutoFeat] Computing 19 new features.


[AutoFeat]    18/   19 new features

2024-07-26 10:10:43,086 INFO: [AutoFeat]    19/   19 new features ...done.
2024-07-26 10:10:43,090 INFO: [AutoFeat] Final dataframe with 106 feature columns (19 new).
2024-07-26 10:10:43,090 INFO: [AutoFeat] Training final classification model.
2024-07-26 10:10:43,524 INFO: [AutoFeat] Trained model: largest coefficients:
2024-07-26 10:10:43,524 INFO: [-1.17411079]
2024-07-26 10:10:43,525 INFO: 5.037028 * gender_boss_Mujer*gender_employee_Mujer
2024-07-26 10:10:43,526 INFO: 3.330279 * join_month_employee_8*join_year_employee_2023
2024-07-26 10:10:43,526 INFO: 2.841055 * seniority_employee_2*Abs(performance_score_boss)
2024-07-26 10:10:43,526 INFO: 1.680698 * gender_boss_Mujer
2024-07-26 10:10:43,527 INFO: 1.267215 * gender_employee_Mujer
2024-07-26 10:10:43,527 INFO: 1.172442 * join_month_employee_6*join_year_boss_2017
2024-07-26 10:10:43,528 INFO: 1.144982 * join_year_employee_2019*Abs(office_distance_diff)
2024-07-26 10:10:43,528 INFO: 0.919206 * gender_employee_Mujer*marital_estatus_

In [29]:
# Apply the already-fitted autofeat model to the test features (transform only, no refit)
X_full_transformed_test = autofeat.transform(concat_full_test)

2024-07-26 10:33:37,910 INFO: [AutoFeat] Computing 19 new features.
2024-07-26 10:33:37,927 INFO: [AutoFeat]    19/   19 new features ...done.


[AutoFeat]    18/   19 new features

### Joining full data

In [30]:
# Remove the raw categorical and numerical columns from the original frames in a single drop
raw_feature_cols = cat_cols + num_cols
train_drop_df = train_df.drop(columns=raw_feature_cols)
test_drop_df = test_df.drop(columns=raw_feature_cols)

# Attach the autofeat output column-wise to the remaining columns of each split
train_full_df = pd.concat((train_drop_df, X_full_transformed), axis='columns')
test_full_df = pd.concat((test_drop_df, X_full_transformed_test), axis='columns')

## Saving full data

In [32]:
# Write the full processed splits as comma-separated files, without the index column
for split_df, file_name in ((train_full_df, 'train_processed.csv'),
                            (test_full_df, 'test_processed.csv')):
    split_df.to_csv(paths.data_processed_dir(file_name), index=False, sep=',')

### Obtaining generated columns

In [38]:
# Obtaining generated columns
#
# The original slice (`iloc[:, -19:]`) relied on the engineered features being the trailing
# columns of the autofeat output and hard-coded their count from one recorded run
# (log: "Final dataframe with 106 feature columns (19 new)", i.e. 87 inputs + 19 new).
# autofeat may select a different number of features on a re-run, so the offset is derived
# from the width of the frame that was fed to autofeat instead. This also behaves correctly
# when no new feature is produced (it yields an empty frame rather than the whole frame).
n_input_cols = concat_full_train.shape[1]

# Both splits must have the same layout, otherwise the same offset cannot be used for both.
assert X_full_transformed.shape[1] == X_full_transformed_test.shape[1], (
    "train and test autofeat outputs have a different number of columns"
)

generated_train = X_full_transformed.iloc[:, n_input_cols:]
generated_test = X_full_transformed_test.iloc[:, n_input_cols:]

## Preprocessing reduced data

In [39]:
# Separate transformers for the reduced feature set, configured like the full-data ones:
# the first category of every column is left out and the encoder returns a dense array.
pt_reduced = PowerTransformer()
ohe_reduced = OneHotEncoder(drop='first', sparse_output=False)

In [40]:
# One-hot encode the reduced categorical columns.
# The encoder is fitted on the training split only and then reused, unchanged, on the test split.
encoded_reduced_train = pd.DataFrame(
    ohe_reduced.fit_transform(train_df[reduced_cat_cols]),
    columns=ohe_reduced.get_feature_names_out(reduced_cat_cols),
)

encoded_reduced_test = pd.DataFrame(
    ohe_reduced.transform(test_df[reduced_cat_cols]),
    columns=ohe_reduced.get_feature_names_out(reduced_cat_cols),
)

In [43]:
# Power-transform the reduced numerical columns.
# The transformer is fitted on the training split and the fitted parameters are applied to test.
scaled_reduced_train = pd.DataFrame(
    data=pt_reduced.fit_transform(train_df[reduced_num_cols]),
    columns=reduced_num_cols,
)

scaled_reduced_test = pd.DataFrame(
    data=pt_reduced.transform(test_df[reduced_num_cols]),
    columns=reduced_num_cols,
)

In [44]:
# Place the reduced encoded categoricals and transformed numericals side by side (column-wise)
concat_red_train = pd.concat((encoded_reduced_train, scaled_reduced_train), axis='columns')
concat_red_test = pd.concat((encoded_reduced_test, scaled_reduced_test), axis='columns')

## Feature Engineering

In [47]:
# Second autofeat model, fitted on the reduced training features against the same target `y`.
# verbose=2 is one level above the setting used for the full-data fit.
autofeat_red = AutoFeatClassifier(verbose=2, n_jobs=-1)
X_red_transformed = autofeat_red.fit_transform(concat_red_train, y)

2024-07-26 10:47:28,657 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 15400 features.
2024-07-26 10:47:28,659 INFO: [AutoFeat] With 2152 data points this new feature matrix would use about 0.13 gb of space.
2024-07-26 10:47:28,660 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             25 features transformed

2024-07-26 10:47:30,053 INFO: [feateng] Generated 20 transformed features from 25 original features - done.
2024-07-26 10:47:30,055 INFO: [feateng] Step 2: first combination of features


[feateng]             900/            990 feature tuples combined

2024-07-26 10:47:30,772 INFO: [feateng] Generated 906 feature combinations from 990 original feature tuples - done.
2024-07-26 10:47:30,778 INFO: [feateng] Generated altogether 995 new features in 2 steps
2024-07-26 10:47:30,779 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2024-07-26 10:47:30,843 INFO: [feateng] Generated a total of 895 additional features


[featsel] Scaling data...done.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.0min remaining:  6.1min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  4.1min remaining:  2.7min


2024-07-26 10:52:18,340 INFO: [featsel] 53 features after 5 feature selection runs
2024-07-26 10:52:18,357 INFO: [featsel] 53 features after correlation filtering


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.8min finished


2024-07-26 10:52:27,361 INFO: [featsel] 17 features after noise filtering
2024-07-26 10:52:27,363 INFO: [AutoFeat] Computing 13 new features.


[AutoFeat]    12/   13 new features

2024-07-26 10:52:29,490 INFO: [AutoFeat]    13/   13 new features ...done.
2024-07-26 10:52:29,492 INFO: [AutoFeat] Final dataframe with 38 feature columns (13 new).
2024-07-26 10:52:29,494 INFO: [AutoFeat] Training final classification model.
2024-07-26 10:52:29,782 INFO: [AutoFeat] Trained model: largest coefficients:
2024-07-26 10:52:29,783 INFO: [0.06720006]
2024-07-26 10:52:29,783 INFO: 0.030004 * exp(performance_score_employee)*Abs(performance_score_employee)
2024-07-26 10:52:29,784 INFO: 0.026251 * join_days_diff**2*low_health_days_employee**3
2024-07-26 10:52:29,785 INFO: 0.024841 * performance_score_employee
2024-07-26 10:52:29,785 INFO: 0.019236 * low_health_days_employee**2*exp(performance_score_employee)
2024-07-26 10:52:29,786 INFO: 0.015548 * performance_score_employee**3*recruitment_channel_employee_PortalWeb
2024-07-26 10:52:29,786 INFO: 0.012133 * join_days_diff**3*performance_employee_low
2024-07-26 10:52:29,787 INFO: 0.007564 * join_year_employee_2016/join_days_diff


In [49]:
# Apply the already-fitted reduced autofeat model to the reduced test features (transform only, no refit)
X_red_transformed_test = autofeat_red.transform(concat_red_test)

2024-07-26 10:55:10,920 INFO: [AutoFeat] Computing 13 new features.
2024-07-26 10:55:10,931 INFO: [AutoFeat]    13/   13 new features ...done.


[AutoFeat]    12/   13 new features

### Joining reduced data

In [52]:
# Attach the reduced autofeat output column-wise to the columns left over once the raw
# categorical/numerical features were dropped (train_drop_df / test_drop_df)
train_red_df = pd.concat((train_drop_df, X_red_transformed), axis='columns')
test_red_df = pd.concat((test_drop_df, X_red_transformed_test), axis='columns')

## Saving reduced data

In [55]:
# Write the reduced processed splits as comma-separated files, without the index column
for split_df, file_name in ((train_red_df, 'train_reduced_processed.csv'),
                            (test_red_df, 'test_reduced_processed.csv')):
    split_df.to_csv(paths.data_processed_dir(file_name), index=False, sep=',')

### Generating a DataFrame with the reduced features plus the features autofeat generated from the full data

In [56]:
# Leftover columns + reduced preprocessed features + features autofeat generated on the full data
train_full_red_df = pd.concat(
    (train_drop_df, concat_red_train, generated_train),
    axis='columns',
)
test_full_red_df = pd.concat(
    (test_drop_df, concat_red_test, generated_test),
    axis='columns',
)

In [59]:
# Write the combined (reduced + generated) splits as comma-separated files, without the index column
for split_df, file_name in ((train_full_red_df, 'train_full_red_processed.csv'),
                            (test_full_red_df, 'test_full_red_processed.csv')):
    split_df.to_csv(paths.data_processed_dir(file_name), index=False, sep=',')