# Preprocessing data

Categorical columns will be encoded using One-Hot encoding.

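Since the distributions are not normal, we are going to use the Power Transformer from Scikit-Learn to make them more normal-like. (Note: the cells below currently import and apply `StandardScaler`, not `PowerTransformer`.)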

## Preparing environment

In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Importing data

In [12]:
# Load the interim train/test tables produced by an earlier step.
# `paths.data_interim_dir` is a project helper; presumably it resolves the file
# name inside the interim data directory — TODO confirm.
# NOTE(review): the preview printed further down shows an `Unnamed: 0` column,
# which suggests these CSVs were written with their index. Passing `index_col=0`
# would drop it, but that changes the columns saved at the end — confirm first.
train_df = pd.read_csv(paths.data_interim_dir('train_node.csv'))
test_df = pd.read_csv(paths.data_interim_dir('test_node.csv'))

In [13]:
# Identifier columns (not model features).
# The original list named 'id_last_boss' twice; the duplicate is removed so that
# selecting with this list does not yield a repeated column.
# NOTE(review): the repeated entry may have been meant to be another id column — confirm.
id_cols = ['id_employee', 'id_last_boss']

# Numerical feature columns (scaled later in the notebook).
num_cols = [
    # employee columns
    'office_distance', 'low_health_days', 'average_permanence', 'salary',
    'performance_score', 'psi_score', 'join_age',
    # matching columns suffixed `_boss`
    'office_distance_boss', 'low_health_days_boss', 'average_permanence_boss',
    'salary_boss', 'performance_score_boss', 'psi_score_boss', 'join_age_boss',
    # difference columns
    'salary_diff', 'join_days_diff', 'age_diff',
    # `avg_*_epb` columns and the boss head-count column
    'avg_od_epb', 'avg_lhd_epb', 'avg_avgp_epb', 'avg_sal_epb',
    'avg_ps_epb', 'avg_psis_epb', 'avg_ja_epb', 'boss_employees_in_charge',
]

# Categorical feature columns (one-hot encoded later in the notebook).
cat_cols = [
    # employee columns
    'seniority', 'work_modality', 'gender', 'recruitment_channel',
    'marital_estatus', 'join_year', 'join_month', 'performance', 'join_age_group',
    # matching columns suffixed `_boss`
    'work_modality_boss', 'gender_boss', 'recruitment_channel_boss',
    'marital_estatus_boss', 'join_year_boss', 'join_month_boss',
    'performance_boss', 'join_age_group_boss',
    # employee-vs-boss flags
    'joined_after_boss', 'younger_than_boss',
]

## Dropping columns according to hypothesis testing

In [14]:
# Columns discarded after hypothesis testing.
# Kept in ONE list so train and test always lose exactly the same columns
# (the list used to be written out twice, once per frame).
hypothesis_dropped_cols = [
    # categorical columns
    'join_month', 'work_modality_boss', 'gender_boss', 'recruitment_channel_boss',
    'marital_estatus_boss', 'join_year_boss', 'join_month_boss', 'join_age_group_boss',
    'younger_than_boss',
    # numerical columns
    'office_distance', 'average_permanence', 'salary',
    'psi_score', 'join_age', 'office_distance_boss', 'average_permanence_boss',
    'salary_boss', 'psi_score_boss', 'join_age_boss', 'age_diff',
    'avg_od_epb', 'avg_lhd_epb', 'avg_sal_epb', 'avg_psis_epb',
    'boss_employees_in_charge',
]

# `drop` returns a new frame, so train_df / test_df stay untouched.
reduced_train_df = train_df.drop(columns=hypothesis_dropped_cols)

reduced_test_df = test_df.drop(columns=hypothesis_dropped_cols)

In [15]:
# Preview the reduced training frame with its last 129 columns hidden.
# NOTE(review): 129 is a magic number; presumably it matches the count of trailing
# columns that come with train_node.csv — confirm and derive it from a named constant.
reduced_train_df.iloc[:, :-129]

Unnamed: 0,id_employee,id_last_boss,seniority,work_modality,low_health_days,gender,recruitment_channel,performance_score,marital_estatus,join_year,...,join_age_group,low_health_days_boss,performance_score_boss,performance_boss,salary_diff,join_days_diff,joined_after_boss,avg_avgp_epb,avg_ps_epb,avg_ja_epb
0,100247,102074,1,Híbrida,1,Mujer,Ferias & Networking,99,Soltero,2018,...,young,0,80,high,1475989,-757,1,5.645161,74.096774,31.225806
1,103355,102115,1,Híbrida,2,Hombre,Ferias & Networking,99,Soltero,2021,...,young,2,94,high,1433226,-940,1,7.148936,73.340426,35.574468
2,100669,102060,1,Híbrida,3,Mujer,Referidos,96,Viudo,2016,...,adult,3,82,high,933894,-990,1,8.275862,76.034483,34.379310
3,103760,102062,1,Híbrida,2,Hombre,Linkedin,96,Soltero,2014,...,young,2,38,low,421768,72,0,4.588235,77.647059,29.176471
4,100965,102062,1,Híbrida,2,Hombre,Linkedin,95,Soltero,2014,...,young,2,38,low,425196,71,0,4.588235,77.647059,29.176471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2147,103567,102171,1,Presencial,8,Mujer,Portal Web,80,Casado,2023,...,adult,4,80,high,1334841,-2270,1,6.610169,82.542373,42.457627
2148,104098,102172,1,Presencial,12,Mujer,Linkedin,71,Casado,2023,...,adult,1,57,low,588663,-2650,1,6.523810,75.936508,39.063492
2149,103987,102155,1,Presencial,1,Hombre,Portal Web,77,Soltero,2023,...,adult,1,39,low,-43879,-2457,1,6.692308,64.153846,35.230769
2150,103810,102141,1,Presencial,2,Hombre,Portal Web,77,Casado,2023,...,adult,1,64,low,306923,-1923,1,7.629630,77.444444,41.222222


In [16]:
# Numerical columns used by the reduced pipeline later in the notebook.
reduced_num_cols = [
    'low_health_days',
    'performance_score',
    'low_health_days_boss',
    'performance_score_boss',
    'salary_diff',
    'join_days_diff',
    'avg_avgp_epb',
    'avg_ps_epb',
    'avg_ja_epb',
]

# Categorical columns used by the reduced pipeline later in the notebook.
reduced_cat_cols = [
    'seniority',
    'work_modality',
    'gender',
    'recruitment_channel',
    'marital_estatus',
    'join_year',
    'performance',
    'join_age_group',
    'performance_boss',
    'joined_after_boss',
]

## Preprocessing full data

In [19]:
# Encoder for the full categorical set: dense output, first level of each
# column dropped.
ohe_full = OneHotEncoder(
    drop='first',
    sparse_output=False,
)
# Scaler for the full numerical set (default settings).
ss_full = StandardScaler()

In [20]:
# One-hot encoding of the categorical columns

# Train: the encoder is fitted here, then its feature names label the columns
encoded_full_train = pd.DataFrame(
    ohe_full.fit_transform(train_df[cat_cols]),
    columns=ohe_full.get_feature_names_out(cat_cols),
)

# Test: only transformed, reusing the encoder fitted on train
encoded_full_test = pd.DataFrame(
    ohe_full.transform(test_df[cat_cols]),
    columns=ohe_full.get_feature_names_out(cat_cols),
)

In [21]:
# Scaling numerical columns

# Train: the scaler is fitted and applied in one step
scaled_full_train = pd.DataFrame(
    data=ss_full.fit_transform(train_df[num_cols]),
    columns=num_cols,
)

# Test: only transformed, reusing the scaler fitted on train
scaled_full_test = pd.DataFrame(
    data=ss_full.transform(test_df[num_cols]),
    columns=num_cols,
)

In [22]:
# Side-by-side join of the encoded and the scaled blocks (column-wise)

concat_full_train = pd.concat(
    [encoded_full_train, scaled_full_train],
    axis='columns',
)

concat_full_test = pd.concat(
    [encoded_full_test, scaled_full_test],
    axis='columns',
)

## Preprocessing full data (repeated, unexecuted copy of the previous section)

In [None]:
# NOTE(review): this cell repeats, character for character, the executed cell
# earlier in the notebook that creates ohe_full / ss_full, and it has no
# execution count. It looks like a leftover copy — consider deleting it.
# Running it replaces the encoder and scaler with fresh, unfitted instances.
ohe_full = OneHotEncoder(sparse_output=False, drop='first')
ss_full = StandardScaler()

In [None]:
# Encoding categories
# NOTE(review): unexecuted repeat of the earlier full-data encoding cell; it
# refits ohe_full on the same train columns — consider deleting it.

# Train: fit the encoder and label the columns with its feature names
encoded_full_train = pd.DataFrame(ohe_full.fit_transform(train_df[cat_cols]))
encoded_full_train.columns = ohe_full.get_feature_names_out(cat_cols)

# Test: transform only, with the encoder fitted on train
encoded_full_test = pd.DataFrame(ohe_full.transform(test_df[cat_cols]))
encoded_full_test.columns = ohe_full.get_feature_names_out(cat_cols)

In [None]:
# Scaling numerical columns
# NOTE(review): unexecuted repeat of the earlier full-data scaling cell; it
# refits ss_full on the same train columns — consider deleting it.

# Train: fit the scaler and apply it
scaled_full_train = pd.DataFrame(ss_full.fit_transform(train_df[num_cols]), columns=num_cols)

# Test: transform only, with the scaler fitted on train
scaled_full_test = pd.DataFrame(ss_full.transform(test_df[num_cols]), columns=num_cols)

In [None]:
# Concatenating dfs
# NOTE(review): unexecuted repeat of the earlier full-data concatenation cell —
# consider deleting it.

# Column-wise join of the encoded and scaled training blocks
concat_full_train = pd.concat([encoded_full_train, scaled_full_train], axis=1)

# Column-wise join of the encoded and scaled test blocks
concat_full_test = pd.concat([encoded_full_test, scaled_full_test], axis=1)

### Feature engineering

Autofeat is a library that applies multiple numerical transformations to the features in order to find meaningful relationships between the features and the target, with the aim of improving the prediction.

In [23]:
from autofeat import AutoFeatClassifier

In [25]:
# Target vector: the `resign` column of the training frame.
y = train_df.loc[:, 'resign']

In [32]:
# Columns handed to AutoFeat as `feateng_cols` (45 entries, order preserved).
feateng_cols = [
    # one-hot columns
    'seniority_2', 'work_modality_Presencial', 'gender_Mujer',
    'recruitment_channel_Headhunter', 'recruitment_channel_Linkedin',
    'recruitment_channel_Portal Web', 'recruitment_channel_Referidos',
    'marital_estatus_Divorciado', 'marital_estatus_Soltero', 'marital_estatus_Viudo',
    'performance_low', 'join_age_group_old_adult', 'join_age_group_young',
    # one-hot columns suffixed/prefixed with `_boss`
    'work_modality_boss_Presencial', 'gender_boss_Mujer',
    'recruitment_channel_boss_Headhunter', 'recruitment_channel_boss_Linkedin',
    'recruitment_channel_boss_Portal Web', 'recruitment_channel_boss_Referidos',
    'marital_estatus_boss_Divorciado', 'marital_estatus_boss_Soltero',
    'marital_estatus_boss_Viudo', 'performance_boss_low',
    'join_age_group_boss_old_adult', 'join_age_group_boss_young',
    # employee-vs-boss flags
    'joined_after_boss_1', 'younger_than_boss_1',
    # numerical columns
    'office_distance', 'low_health_days', 'average_permanence', 'salary',
    'performance_score', 'psi_score', 'join_age',
    # numerical `_boss` columns
    'office_distance_boss', 'low_health_days_boss', 'average_permanence_boss',
    'salary_boss', 'performance_score_boss', 'psi_score_boss', 'join_age_boss',
    # differences and boss head-count
    'salary_diff', 'join_days_diff', 'age_diff', 'boss_employees_in_charge',
]

In [33]:
# AutoFeat classifier restricted to the columns listed in `feateng_cols`.
autofeat = AutoFeatClassifier(
    feateng_cols=feateng_cols,
    n_jobs=-1,
    verbose=1,
)

# Fit on the full training matrix and keep the transformed frame.
# NOTE: in the recorded log below this step ran from 00:37 to 01:14, i.e. well
# over half an hour, so re-running it is expensive.
X_full_transformed = autofeat.fit_transform(concat_full_train, y)

2024-09-09 00:37:38,927 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 49770 features.
2024-09-09 00:37:38,928 INFO: [AutoFeat] With 2152 data points this new feature matrix would use about 0.43 gb of space.
2024-09-09 00:37:38,930 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             45 features transformed

2024-09-09 00:37:46,754 INFO: [feateng] Generated 88 transformed features from 45 original features - done.
2024-09-09 00:37:46,758 INFO: [feateng] Step 2: first combination of features


[feateng]            8600/           8778 feature tuples combined

2024-09-09 00:38:08,059 INFO: [feateng] Generated 8629 feature combinations from 8778 original feature tuples - done.
2024-09-09 00:38:08,153 INFO: [feateng] Generated altogether 8795 new features in 2 steps
2024-09-09 00:38:08,155 INFO: [feateng] Removing correlated features, as well as additions at the highest level


[feateng]            8700/           8778 feature tuples combined

2024-09-09 00:38:09,004 INFO: [feateng] Generated a total of 8166 additional features


[featsel] Scaling data...done.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 32.0min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 32.3min remaining: 48.4min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 33.1min remaining: 22.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 35.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 35.9min finished


2024-09-09 01:14:06,314 INFO: [featsel] 62 features after 5 feature selection runs
2024-09-09 01:14:06,396 INFO: [featsel] 62 features after correlation filtering
2024-09-09 01:14:13,369 INFO: [featsel] 30 features after noise filtering
2024-09-09 01:14:13,375 INFO: [AutoFeat] Computing 25 new features.


[AutoFeat]    23/   25 new features

2024-09-09 01:14:16,410 INFO: [AutoFeat]    25/   25 new features ...done.
2024-09-09 01:14:16,413 INFO: [AutoFeat] Final dataframe with 121 feature columns (25 new).
2024-09-09 01:14:16,414 INFO: [AutoFeat] Training final classification model.


[AutoFeat]    24/   25 new features

2024-09-09 01:14:17,479 INFO: [AutoFeat] Trained model: largest coefficients:
2024-09-09 01:14:17,480 INFO: [-1.28320283]
2024-09-09 01:14:17,481 INFO: 5.693560 * gender_Mujer*gender_boss_Mujer
2024-09-09 01:14:17,482 INFO: 2.755608 * seniority_2
2024-09-09 01:14:17,482 INFO: 2.227825 * gender_Mujer
2024-09-09 01:14:17,482 INFO: 1.866277 * gender_boss_Mujer
2024-09-09 01:14:17,483 INFO: 1.207835 * seniority_2*Abs(psi_score_boss)
2024-09-09 01:14:17,483 INFO: 1.188418 * join_year_2019
2024-09-09 01:14:17,484 INFO: 0.802443 * gender_Mujer*marital_estatus_Soltero
2024-09-09 01:14:17,485 INFO: 0.656138 * marital_estatus_Divorciado*Abs(psi_score)
2024-09-09 01:14:17,486 INFO: 0.628426 * gender_Mujer*recruitment_channel_PortalWeb
2024-09-09 01:14:17,486 INFO: 0.572402 * work_modality_Presencial*work_modality_boss_Presencial
2024-09-09 01:14:17,486 INFO: 0.525835 * gender_boss_Mujer*Abs(join_age_boss)
2024-09-09 01:14:17,487 INFO: 0.487760 * average_permanence*recruitment_channel_Referidos
20

In [34]:
# Apply the AutoFeat model fitted on train to the test matrix (no refit here);
# the recorded log reports 25 new features being computed.
X_full_transformed_test = autofeat.transform(concat_full_test)

2024-09-09 01:14:17,525 INFO: [AutoFeat] Computing 25 new features.
2024-09-09 01:14:17,549 INFO: [AutoFeat]    25/   25 new features ...done.


[AutoFeat]    24/   25 new features

### Joining full data

In [35]:
# Remove the raw categorical and numerical columns in a single pass

# Train
train_drop_df = train_df.drop(columns=cat_cols + num_cols)

# Test
test_drop_df = test_df.drop(columns=cat_cols + num_cols)

# Put the remaining columns next to the AutoFeat output (column-wise)

# Train
train_full_df = pd.concat(
    [train_drop_df, X_full_transformed],
    axis='columns',
)

# Test
test_full_df = pd.concat(
    [test_drop_df, X_full_transformed_test],
    axis='columns',
)

## Saving full data

In [36]:
# Write the full-feature frames as comma-separated files, without the index.
train_processed_path = paths.data_processed_dir('train_processed.csv')
train_full_df.to_csv(train_processed_path, sep=',', index=False)

test_processed_path = paths.data_processed_dir('test_processed.csv')
test_full_df.to_csv(test_processed_path, sep=',', index=False)

### Obtaining generated columns

In [37]:
# Obtaining generated columns: every column of the AutoFeat output whose name
# is not already a column of the matrix that was fed to AutoFeat.
# The input column names are collected once into a set, so each membership
# test is O(1) instead of rebuilding and scanning a list per column.
# Order follows X_full_transformed.columns, as before.
input_col_names = set(concat_full_train.columns)
generated_cols = [col for col in X_full_transformed.columns if col not in input_col_names]

# Same generated columns selected from train and test.
generated_train = X_full_transformed[generated_cols]
generated_test = X_full_transformed_test[generated_cols]

### Generating a df with reduced features and adding obtained features from full data autofeat

In [43]:
# Encoder for the reduced categorical set: dense output, first level of each
# column dropped.
ohe_red = OneHotEncoder(
    drop='first',
    sparse_output=False,
)
# Scaler for the reduced numerical set (default settings).
ss_red = StandardScaler()

In [44]:
# One-hot encoding of the reduced categorical columns

# Train: the encoder is fitted here, then its feature names label the columns
encoded_red_train = pd.DataFrame(
    ohe_red.fit_transform(train_df[reduced_cat_cols]),
    columns=ohe_red.get_feature_names_out(reduced_cat_cols),
)

# Test: only transformed, reusing the encoder fitted on train
encoded_red_test = pd.DataFrame(
    ohe_red.transform(test_df[reduced_cat_cols]),
    columns=ohe_red.get_feature_names_out(reduced_cat_cols),
)

In [45]:
# Scaling the reduced numerical columns

# Train: the scaler is fitted and applied in one step
scaled_red_train = pd.DataFrame(
    data=ss_red.fit_transform(train_df[reduced_num_cols]),
    columns=reduced_num_cols,
)

# Test: only transformed, reusing the scaler fitted on train
scaled_red_test = pd.DataFrame(
    data=ss_red.transform(test_df[reduced_num_cols]),
    columns=reduced_num_cols,
)

In [46]:
# Side-by-side join of the reduced encoded and scaled blocks (column-wise)

concat_red_train = pd.concat(
    [encoded_red_train, scaled_red_train],
    axis='columns',
)

concat_red_test = pd.concat(
    [encoded_red_test, scaled_red_test],
    axis='columns',
)

In [47]:
# Reduced dataset: leftover columns + reduced encoded/scaled block + the
# AutoFeat-generated columns taken from the full-data run.
train_red_df = pd.concat(
    [train_drop_df, concat_red_train, generated_train],
    axis='columns',
)
test_red_df = pd.concat(
    [test_drop_df, concat_red_test, generated_test],
    axis='columns',
)

In [50]:
# Write the reduced frames as comma-separated files, without the index.

train_red_path = paths.data_processed_dir('train_red_processed.csv')
train_red_df.to_csv(train_red_path, sep=',', index=False)

test_red_path = paths.data_processed_dir('test_red_processed.csv')
test_red_df.to_csv(test_red_path, sep=',', index=False)