# Feature engineering

Several libraries automate feature engineering. In this notebook we use AutoFeat, a powerful tool for automated feature engineering. The data must be preprocessed before it is passed to AutoFeat.

## Preparing environment

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from autofeat import AutoFeatClassifier
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Importing data

In [2]:
# Read the train and test splits; file locations are resolved by the
# project's `paths.data_interim_dir` helper.
train_df, test_df = (
    pd.read_csv(paths.data_interim_dir(csv_name))
    for csv_name in ('train_node.csv', 'test_node.csv')
)

In [3]:
# Numerical columns (the ones passed to the StandardScaler further down).
num_cols = [
    # columns without a suffix
    'office_distance', 'low_health_days', 'average_permanence', 'salary',
    'performance_score', 'psi_score', 'join_age',
    # `_boss` counterparts
    'office_distance_boss', 'low_health_days_boss', 'average_permanence_boss',
    'salary_boss', 'performance_score_boss', 'psi_score_boss', 'join_age_boss',
    # `_diff` columns
    'salary_diff', 'join_days_diff', 'age_diff',
    # `avg_*_epb` columns
    'avg_od_epb', 'avg_lhd_epb', 'avg_avgp_epb', 'avg_sal_epb',
    'avg_ps_epb', 'avg_psis_epb', 'avg_ja_epb',
    'boss_employees_in_charge',
]

# Categorical columns (the ones passed to the OneHotEncoder further down).
cat_cols = [
    # columns without a `_boss` suffix
    'seniority', 'work_modality', 'gender', 'recruitment_channel', 'marital_estatus',
    'join_year', 'join_month', 'performance', 'join_age_group',
    # `_boss` columns
    'work_modality_boss', 'gender_boss', 'recruitment_channel_boss', 'marital_estatus_boss',
    'join_year_boss', 'join_month_boss', 'performance_boss', 'join_age_group_boss',
    'joined_after_boss', 'younger_than_boss',
]

## Dropping columns according to hypothesis testing

In [4]:
# Columns removed to build the "reduced" variant of the data.
# The list is defined once and shared by train and test so both frames are
# guaranteed to lose exactly the same columns (the original duplicated the
# literal list, which could silently diverge after an edit).
cols_to_drop = [
    'join_month', 'work_modality_boss', 'gender_boss', 'recruitment_channel_boss',
    'marital_estatus_boss', 'join_year_boss', 'join_month_boss', 'join_age_group_boss',
    'younger_than_boss', 'office_distance', 'average_permanence', 'salary',
    'psi_score', 'join_age', 'office_distance_boss', 'average_permanence_boss',
    'salary_boss', 'psi_score_boss', 'join_age_boss', 'age_diff',
    'avg_od_epb', 'avg_lhd_epb', 'avg_sal_epb', 'avg_psis_epb',
    'boss_employees_in_charge',
]

# `drop` returns a new frame; `train_df` / `test_df` stay untouched and are
# still used for the full-data preprocessing below.
reduced_train_df = train_df.drop(columns=cols_to_drop)
reduced_test_df = test_df.drop(columns=cols_to_drop)

## Preprocessing full data

In [5]:
# Preprocessors for the full data set; both are fitted on train in the cells below.
ss_full = StandardScaler()
# drop='first' omits the first category of each feature; sparse_output=False
# makes the encoder return a dense array.
ohe_full = OneHotEncoder(drop='first', sparse_output=False)

In [6]:
# Encoding categories
# The encoder is fitted on the training frame only and then re-applied to test.
train_cat_array = ohe_full.fit_transform(train_df[cat_cols])
ohe_col_names = ohe_full.get_feature_names_out(cat_cols)

# Train
encoded_full_train = pd.DataFrame(train_cat_array, columns=ohe_col_names)

# Test
encoded_full_test = pd.DataFrame(ohe_full.transform(test_df[cat_cols]), columns=ohe_col_names)

In [7]:
# Scaling numericals
# The scaler is fitted on the training frame only; test re-uses the fitted scaler.
train_num_array = ss_full.fit_transform(train_df[num_cols])
test_num_array = ss_full.transform(test_df[num_cols])

# Train
scaled_full_train = pd.DataFrame(train_num_array, columns=num_cols)

# Test
scaled_full_test = pd.DataFrame(test_num_array, columns=num_cols)

In [8]:
# Concatenating dfs
# Encoded categoricals first, scaled numericals second, joined column-wise.
concat_full_train = pd.concat([encoded_full_train, scaled_full_train], axis='columns')
concat_full_test = pd.concat([encoded_full_test, scaled_full_test], axis='columns')

### Feature engineering

AutoFeat is a library that applies multiple numerical transformations to the features in order to find meaningful relationships between the features and the target, with the aim of improving predictions.

In [9]:
# Target vector: the `resign` column of the training frame.
y = train_df['resign']

In [10]:
# Columns handed to AutoFeat through its `feateng_cols` argument.
feateng_cols = [
    # One-hot encoded columns
    'seniority_2', 'work_modality_Presencial', 'gender_Mujer',
    'recruitment_channel_Headhunter', 'recruitment_channel_Linkedin',
    'recruitment_channel_Portal Web', 'recruitment_channel_Referidos',
    'marital_estatus_Divorciado', 'marital_estatus_Soltero', 'marital_estatus_Viudo',
    'performance_low', 'join_age_group_old_adult', 'join_age_group_young',
    # One-hot encoded boss-related columns
    'work_modality_boss_Presencial', 'gender_boss_Mujer',
    'recruitment_channel_boss_Headhunter', 'recruitment_channel_boss_Linkedin',
    'recruitment_channel_boss_Portal Web', 'recruitment_channel_boss_Referidos',
    'marital_estatus_boss_Divorciado', 'marital_estatus_boss_Soltero', 'marital_estatus_boss_Viudo',
    'performance_boss_low', 'join_age_group_boss_old_adult', 'join_age_group_boss_young',
    'joined_after_boss_1', 'younger_than_boss_1',
    # Numerical columns
    'office_distance', 'low_health_days', 'average_permanence', 'salary',
    'performance_score', 'psi_score', 'join_age',
    'office_distance_boss', 'low_health_days_boss', 'average_permanence_boss',
    'salary_boss', 'performance_score_boss', 'psi_score_boss', 'join_age_boss',
    'salary_diff', 'join_days_diff', 'age_diff',
    'boss_employees_in_charge',
]

In [11]:
# Fit AutoFeat on the preprocessed training matrix and keep the augmented frame.
# NOTE: this cell is slow; the recorded run below took roughly 35 minutes.
autofeat = AutoFeatClassifier(
    feateng_cols=feateng_cols,
    n_jobs=-1,
    verbose=1,
)
X_full_transformed = autofeat.fit_transform(concat_full_train, y)

2024-10-24 23:17:00,375 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 49770 features.
2024-10-24 23:17:00,376 INFO: [AutoFeat] With 2152 data points this new feature matrix would use about 0.43 gb of space.
2024-10-24 23:17:00,380 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             45 features transformed

2024-10-24 23:17:07,110 INFO: [feateng] Generated 88 transformed features from 45 original features - done.
2024-10-24 23:17:07,134 INFO: [feateng] Step 2: first combination of features


[feateng]            8400/           8778 feature tuples combined

2024-10-24 23:17:11,926 INFO: [feateng] Generated 8629 feature combinations from 8778 original feature tuples - done.


[feateng]            8700/           8778 feature tuples combined

2024-10-24 23:17:12,021 INFO: [feateng] Generated altogether 8795 new features in 2 steps
2024-10-24 23:17:12,022 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2024-10-24 23:17:12,669 INFO: [feateng] Generated a total of 8166 additional features


[featsel] Scaling data...done.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 31.8min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 32.1min remaining: 48.1min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 32.8min remaining: 21.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 35.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 35.4min finished


2024-10-24 23:52:34,565 INFO: [featsel] 51 features after 5 feature selection runs
2024-10-24 23:52:34,868 INFO: [featsel] 51 features after correlation filtering
2024-10-24 23:52:39,732 INFO: [featsel] 27 features after noise filtering
2024-10-24 23:52:39,736 INFO: [AutoFeat] Computing 22 new features.


[AutoFeat]    21/   22 new features

2024-10-24 23:52:42,727 INFO: [AutoFeat]    22/   22 new features ...done.
2024-10-24 23:52:42,730 INFO: [AutoFeat] Final dataframe with 118 feature columns (22 new).
2024-10-24 23:52:42,731 INFO: [AutoFeat] Training final classification model.
2024-10-24 23:52:43,745 INFO: [AutoFeat] Trained model: largest coefficients:
2024-10-24 23:52:43,746 INFO: [-1.25794388]
2024-10-24 23:52:43,747 INFO: 5.794558 * gender_Mujer*gender_boss_Mujer
2024-10-24 23:52:43,747 INFO: 2.997091 * seniority_2
2024-10-24 23:52:43,748 INFO: 2.298569 * gender_Mujer
2024-10-24 23:52:43,748 INFO: 1.769027 * gender_boss_Mujer
2024-10-24 23:52:43,749 INFO: 1.248439 * join_year_2019
2024-10-24 23:52:43,749 INFO: 0.937848 * seniority_2*Abs(psi_score_boss)
2024-10-24 23:52:43,750 INFO: 0.803051 * gender_Mujer*marital_estatus_Soltero
2024-10-24 23:52:43,750 INFO: 0.760109 * performance_score
2024-10-24 23:52:43,751 INFO: 0.644197 * gender_Mujer*recruitment_channel_PortalWeb
2024-10-24 23:52:43,751 INFO: 0.615844 * work

In [12]:
# Apply the already-fitted AutoFeat model to the test matrix (transform only, no re-fit).
X_full_transformed_test = autofeat.transform(concat_full_test)

2024-10-24 23:52:43,785 INFO: [AutoFeat] Computing 22 new features.
2024-10-24 23:52:43,806 INFO: [AutoFeat]    22/   22 new features ...done.


[AutoFeat]    21/   22 new features

### Obtaining generated columns

In [13]:
# Obtaining generated columns
# A column counts as "generated" when it appears in the AutoFeat output but not
# in the frame that was fed to AutoFeat. The input column names are collected
# into a set once, instead of rebuilding a list for every candidate column,
# so each membership check is a constant-time lookup.
original_cols = set(concat_full_train.columns)
generated_cols = [col for col in X_full_transformed.columns if col not in original_cols]

# Same column selection for train and test so both frames share one schema.
generated_train = X_full_transformed[generated_cols]
generated_test = X_full_transformed_test[generated_cols]

## Saving full data

In [16]:
# Full data: all original columns plus the AutoFeat-generated features
# (no column is dropped in this cell).

# Train
train_full_df = pd.concat([train_df, generated_train], axis='columns')

# Test
test_full_df = pd.concat([test_df, generated_test], axis='columns')

# Saving these dataframes

train_full_df.to_csv(path_or_buf=paths.data_processed_dir('train_featured.csv'), sep=',', index=False)
test_full_df.to_csv(path_or_buf=paths.data_processed_dir('test_featured.csv'), sep=',', index=False)

## Saving reduced data

In [17]:
# Reduced data: the column-reduced frames plus the AutoFeat-generated features.

# Train
train_red_df = pd.concat([reduced_train_df, generated_train], axis='columns')

# Test
test_red_df = pd.concat([reduced_test_df, generated_test], axis='columns')

# Saving these dataframes

train_red_df.to_csv(path_or_buf=paths.data_processed_dir('train_red_featured.csv'), sep=',', index=False)
test_red_df.to_csv(path_or_buf=paths.data_processed_dir('test_red_featured.csv'), sep=',', index=False)