# Preprocessing data

Categorical columns will be encoded as integer codes using scikit-learn's `LabelEncoder` (label encoding).

Since the numerical distributions are not normal, we will use the `PowerTransformer` from scikit-learn (Yeo-Johnson method) to make them more Gaussian-like.

## Preparing environment

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer, LabelEncoder
from autofeat import AutoFeatClassifier
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Importing data

In [2]:
# Read the interim train/test tables; both file names are resolved through
# the project's `paths.data_interim_dir` helper.
train_df, test_df = (
    pd.read_csv(paths.data_interim_dir(file_name))
    for file_name in ('train_node.csv', 'test_node.csv')
)

In [3]:
# Column-name groups used by the preprocessing cells below.

# Identifier column.
id_col = ['id_employee_employee']

# Numerical columns (these are the ones passed to the power transformer).
num_cols = [
    # columns suffixed `_employee`
    'office_distance_employee', 'low_health_days_employee', 'average_permanence_employee',
    'salary_employee', 'performance_score_employee', 'psi_score_employee', 'join_age_employee',
    # columns suffixed `_boss`
    'office_distance_boss', 'low_health_days_boss', 'average_permanence_boss',
    'salary_boss', 'performance_score_boss', 'psi_score_boss', 'join_age_boss',
    # columns suffixed `_diff`
    'office_distance_diff', 'low_health_days_diff', 'average_permanence_diff',
    'salary_diff', 'join_days_diff', 'age_diff',
    # `avg_*_epb` columns
    'avg_od_epb', 'avg_lhd_epb', 'avg_avgp_epb', 'avg_sal_epb',
    'avg_ps_epb', 'avg_psis_epb', 'avg_ja_epb',
    # `avg_*_bpb` columns
    'avg_od_bpb', 'avg_lhd_bpb', 'avg_avgp_bpb', 'avg_sal_bpb',
    'avg_ps_bpb', 'avg_psis_bpb', 'avg_ja_bpb',
    # `*_in_charge` columns
    'boss_employees_in_charge', 'bob_bosses_in_charge',
]

# Categorical columns (cast to `category` and then label-encoded below).
cat_cols = [
    # columns suffixed `_employee`
    'seniority_employee', 'work_modality_employee', 'gender_employee',
    'recruitment_channel_employee', 'marital_estatus_employee', 'performance_employee',
    # columns suffixed `_boss`
    'work_modality_boss', 'gender_boss', 'recruitment_channel_boss',
    'marital_estatus_boss', 'performance_boss',
    # remaining employee-vs-boss columns
    'joined_after_boss', 'younger_than_boss',
]

In [4]:
# Encoding id_last_boss_employee and id_last_boss_boss columns
#
# A single encoder is shared by both columns so that a given boss id maps to
# the same integer code wherever it appears. The encoder is fitted on every id
# found in either column of either split: fitting on `id_last_boss_employee`
# alone would make `transform` raise ValueError for any id that occurs only in
# `id_last_boss_boss`.

boss_id_cols = ['id_last_boss_employee', 'id_last_boss_boss']

boss_ids = pd.concat(
    [frame[col] for frame in (train_df, test_df) for col in boss_id_cols],
    axis=0,
).unique().tolist()

label_encoder = LabelEncoder()
label_encoder.fit(boss_ids)

for frame in (train_df, test_df):
    for col in boss_id_cols:
        frame[col] = label_encoder.transform(frame[col])

In [5]:
# Saving id_employee
#
# Keep each split's employee ids aside so they can be re-attached when the
# processed tables are written out, and remove the id column from the feature
# frames. `DataFrame.pop` returns the column and deletes it from the frame in
# one step.
#
# The test ids must come from `test_df`: taking them from `train_df` would
# attach train ids (and the train row count) to the saved test files.

train_id = train_df.pop('id_employee_employee')
test_id = test_df.pop('id_employee_employee')

## Converting categorical columns

In [6]:
# Cast every categorical column to pandas' `category` dtype in both splits.
for frame in (train_df, test_df):
    frame[cat_cols] = frame[cat_cols].astype('category')

## Encoding categorical columns

In [7]:
# One encoder instance is reused for every categorical column: it is re-fitted
# on the train values of the column and the resulting mapping is then applied
# to the test values of the same column.
label_enc_full = LabelEncoder()

for cat_col in cat_cols:
    train_codes = label_enc_full.fit_transform(train_df[cat_col])
    test_codes = label_enc_full.transform(test_df[cat_col])
    train_df[cat_col] = train_codes
    test_df[cat_col] = test_codes

## Scaling numerical columns

In [8]:
# Yeo-Johnson power transform: parameters are estimated on the train split
# only and then applied unchanged to the test split.
scaler = PowerTransformer(method='yeo-johnson')

train_num_transformed = scaler.fit_transform(train_df[num_cols])
test_num_transformed = scaler.transform(test_df[num_cols])

train_df.loc[:, num_cols] = train_num_transformed
test_df.loc[:, num_cols] = test_num_transformed

## Getting node features

In [9]:
# Trailing columns taken as the node-feature block of each split.
# NOTE(review): train takes one more trailing column than test — presumably
# the `resign` target sits among them; confirm against the interim files.
n_tail_cols_train, n_tail_cols_test = 193, 192

node_feat_train = train_df.iloc[:, -n_tail_cols_train:]
node_feat_test = test_df.iloc[:, -n_tail_cols_test:]

## Getting original features

In [10]:
# Everything before the trailing node-feature block is kept as the
# "original" feature set of each split.
n_trailing_train, n_trailing_test = 193, 192

og_train_df = train_df.iloc[:, :-n_trailing_train]
og_test_df = test_df.iloc[:, :-n_trailing_test]

In [11]:
# Summarise dtypes and non-null counts of the original feature columns.
og_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152 entries, 0 to 2151
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id_last_boss_employee         2152 non-null   int64  
 1   seniority_employee            2152 non-null   int64  
 2   work_modality_employee        2152 non-null   int32  
 3   office_distance_employee      2152 non-null   float64
 4   low_health_days_employee      2152 non-null   float64
 5   gender_employee               2152 non-null   int32  
 6   recruitment_channel_employee  2152 non-null   int32  
 7   average_permanence_employee   2152 non-null   float64
 8   salary_employee               2152 non-null   float64
 9   performance_score_employee    2152 non-null   float64
 10  psi_score_employee            2152 non-null   float64
 11  marital_estatus_employee      2152 non-null   int32  
 12  join_age_employee             2152 non-null   float64
 13  joi

## Dropping features according to hypothesis testing

In [12]:
# Features to drop from the original feature set to build the reduced one.
discard_feat = [
    'join_month_employee',
    'work_modality_boss', 'gender_boss', 'recruitment_channel_boss', 'marital_estatus_boss',
    'join_month_boss', 'performance_boss', 'joined_after_boss', 'younger_than_boss',
    'office_distance_employee', 'average_permanence_employee', 'salary_employee',
    'psi_score_employee', 'join_age_employee',
    'office_distance_boss', 'low_health_days_boss', 'average_permanence_boss',
    'salary_boss', 'psi_score_boss', 'join_age_boss',
    'office_distance_diff', 'average_permanence_diff', 'age_diff',
    'avg_od_epb', 'avg_lhd_epb', 'avg_avgp_epb', 'avg_sal_epb', 'avg_psis_epb',
    'avg_od_bpb', 'avg_lhd_bpb', 'avg_avgp_bpb', 'avg_sal_bpb', 'avg_psis_bpb', 'avg_ja_bpb',
    'boss_employees_in_charge', 'bob_bosses_in_charge',
]

# The same columns are removed from both splits; the source frames are left untouched.
og_reduced_train_df, og_reduced_test_df = (
    frame.drop(columns=discard_feat) for frame in (og_train_df, og_test_df)
)

## Feature engineering on full data

Autofeat is a library that applies multiple numerical transformations to the features in order to find meaningful relationships between the features and the target, with the aim of improving predictions.

In [13]:
# Target vector used to fit both autofeat models.
y = train_df.loc[:, 'resign']

In [14]:
# Every original column is offered to autofeat for feature engineering.
full_feateng_cols = list(og_train_df.columns)
autofeat_full = AutoFeatClassifier(
    feateng_cols=full_feateng_cols,
    n_jobs=-1,
    verbose=2,
)

In [15]:
# Applying automatic feature engineering
# NOTE: long-running cell — in the recorded run below, the feature-selection
# step alone took about 118 minutes on 12 workers.

X_train_full_transf = autofeat_full.fit_transform(og_train_df, y)

2024-07-28 07:32:55,360 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 74305 features.
2024-07-28 07:32:55,362 INFO: [AutoFeat] With 2152 data points this new feature matrix would use about 0.64 gb of space.
2024-07-28 07:32:55,365 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             55 features transformed

2024-07-28 07:32:57,622 INFO: [feateng] Generated 222 transformed features from 55 original features - done.
2024-07-28 07:32:57,626 INFO: [feateng] Step 2: first combination of features


[feateng]           23700/          38226 feature tuples combined

  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[feateng]           38100/          38226 feature tuples combined

2024-07-28 07:33:15,824 INFO: [feateng] Generated 38062 feature combinations from 38226 original feature tuples - done.


[feateng]           38200/          38226 feature tuples combined

2024-07-28 07:33:16,245 INFO: [feateng] Generated altogether 38293 new features in 2 steps
2024-07-28 07:33:16,246 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2024-07-28 07:33:17,971 INFO: [feateng] Generated a total of 33538 additional features


[featsel] Scaling data...done.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 117.4min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 117.5min remaining: 176.3min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 117.6min remaining: 78.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 118.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 118.6min finished


2024-07-28 09:31:53,619 INFO: [featsel] 46 features after 5 feature selection runs
2024-07-28 09:31:54,101 INFO: [featsel] 46 features after correlation filtering
2024-07-28 09:31:57,537 INFO: [featsel] 29 features after noise filtering
2024-07-28 09:31:57,547 INFO: [AutoFeat] Computing 27 new features.


[AutoFeat]    24/   27 new features

2024-07-28 09:32:00,517 INFO: [AutoFeat]    27/   27 new features ...done.
2024-07-28 09:32:00,522 INFO: [AutoFeat] Final dataframe with 82 feature columns (27 new).
2024-07-28 09:32:00,523 INFO: [AutoFeat] Training final classification model.


[AutoFeat]    26/   27 new features

2024-07-28 09:32:01,327 INFO: [AutoFeat] Trained model: largest coefficients:
2024-07-28 09:32:01,328 INFO: [-1.33402973]
2024-07-28 09:32:01,329 INFO: 5.346941 * gender_boss*gender_employee
2024-07-28 09:32:01,329 INFO: 1.511400 * sqrt(marital_estatus_employee)*seniority_employee
2024-07-28 09:32:01,329 INFO: 1.487994 * seniority_employee*exp(avg_lhd_bpb)
2024-07-28 09:32:01,330 INFO: 0.876908 * gender_boss
2024-07-28 09:32:01,330 INFO: 0.775278 * performance_employee*performance_score_employee
2024-07-28 09:32:01,331 INFO: 0.757423 * gender_employee*performance_employee
2024-07-28 09:32:01,332 INFO: 0.673955 * gender_employee
2024-07-28 09:32:01,333 INFO: 0.474227 * gender_boss*Abs(join_age_boss)
2024-07-28 09:32:01,333 INFO: 0.446958 * gender_employee*sqrt(recruitment_channel_employee)
2024-07-28 09:32:01,333 INFO: 0.376649 * sqrt(marital_estatus_employee)*Abs(performance_score_boss)
2024-07-28 09:32:01,334 INFO: 0.373801 * gender_employee*Abs(salary_employee)
2024-07-28 09:32:01,33

In [22]:
# Compute the engineered features for the test split with the already-fitted `autofeat_full`.
X_test_full_transf = autofeat_full.transform(og_test_df)

2024-07-28 12:47:25,851 INFO: [AutoFeat] Computing 27 new features.
2024-07-28 12:47:25,871 INFO: [AutoFeat]    27/   27 new features ...done.


[AutoFeat]    26/   27 new features

## Feature engineering on reduced data

In [17]:
# Only the columns that survived the discard step are offered to autofeat.
reduced_feateng_cols = list(og_reduced_train_df.columns)
autofeat_red = AutoFeatClassifier(
    feateng_cols=reduced_feateng_cols,
    n_jobs=-1,
    verbose=2,
)

In [18]:
# Applying automatic feature engineering
# NOTE: in the recorded run below, the feature-selection step took about
# 10.7 minutes on 12 workers.

X_train_red_transf = autofeat_red.fit_transform(og_reduced_train_df, y)

2024-07-28 09:32:01,408 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 8911 features.
2024-07-28 09:32:01,408 INFO: [AutoFeat] With 2152 data points this new feature matrix would use about 0.08 gb of space.
2024-07-28 09:32:01,409 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             19 features transformed

2024-07-28 09:32:03,004 INFO: [feateng] Generated 69 transformed features from 19 original features - done.
2024-07-28 09:32:03,007 INFO: [feateng] Step 2: first combination of features


[feateng]            3000/           3828 feature tuples combined

  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[feateng]            3500/           3828 feature tuples combined

2024-07-28 09:32:04,943 INFO: [feateng] Generated 3783 feature combinations from 3828 original feature tuples - done.
2024-07-28 09:32:04,979 INFO: [feateng] Generated altogether 3858 new features in 2 steps
2024-07-28 09:32:04,980 INFO: [feateng] Removing correlated features, as well as additions at the highest level


[feateng]            3800/           3828 feature tuples combined

2024-07-28 09:32:05,130 INFO: [feateng] Generated a total of 2717 additional features


[featsel] Scaling data...done.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 10.3min remaining: 15.5min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 10.4min remaining:  6.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.7min finished


2024-07-28 09:42:45,614 INFO: [featsel] 36 features after 5 feature selection runs
2024-07-28 09:42:45,623 INFO: [featsel] 36 features after correlation filtering
2024-07-28 09:42:49,269 INFO: [featsel] 27 features after noise filtering
2024-07-28 09:42:49,271 INFO: [AutoFeat] Computing 25 new features.


[AutoFeat]    22/   25 new features

2024-07-28 09:42:52,233 INFO: [AutoFeat]    25/   25 new features ...done.
2024-07-28 09:42:52,236 INFO: [AutoFeat] Final dataframe with 44 feature columns (25 new).
2024-07-28 09:42:52,237 INFO: [AutoFeat] Training final classification model.


[AutoFeat]    24/   25 new features

2024-07-28 09:42:52,315 INFO: [AutoFeat] Trained model: largest coefficients:
2024-07-28 09:42:52,316 INFO: [-1.97994311e-06]
2024-07-28 09:42:52,317 INFO: 0.001621 * exp(performance_score_employee)*exp(recruitment_channel_employee)
2024-07-28 09:42:52,318 INFO: 0.000856 * marital_estatus_employee**3*exp(marital_estatus_employee)
2024-07-28 09:42:52,318 INFO: 0.000648 * avg_ps_epb**3*marital_estatus_employee**3
2024-07-28 09:42:52,319 INFO: 0.000293 * work_modality_employee*exp(recruitment_channel_employee)
2024-07-28 09:42:52,319 INFO: 0.000130 * gender_employee/performance_score_boss
2024-07-28 09:42:52,319 INFO: 0.000077 * performance_score_employee*exp(performance_score_boss)
2024-07-28 09:42:52,320 INFO: 0.000075 * sqrt(id_last_boss_employee)*sqrt(recruitment_channel_employee)
2024-07-28 09:42:52,320 INFO: 0.000070 * low_health_days_diff/performance_score_employee
2024-07-28 09:42:52,321 INFO: 0.000059 * Abs(avg_ja_epb)/low_health_days_employee
2024-07-28 09:42:52,322 INFO: 0.0000

In [27]:
# Compute the engineered features for the reduced test split with the already-fitted `autofeat_red`.
X_test_red_transf = autofeat_red.transform(og_reduced_test_df)

2024-07-28 12:49:05,768 INFO: [AutoFeat] Computing 25 new features.
2024-07-28 12:49:05,790 INFO: [AutoFeat]    25/   25 new features ...done.


[AutoFeat]    24/   25 new features

## Saving data

In [34]:
# Saving full data

# Each output table is: employee ids | autofeat-transformed full features | node features,
# joined column-wise.
full_train_parts = [train_id, X_train_full_transf, node_feat_train]
full_test_parts = [test_id, X_test_full_transf, node_feat_test]

full_train_df = pd.concat(full_train_parts, axis=1)
full_test_df = pd.concat(full_test_parts, axis=1)

full_train_path = paths.data_processed_dir('train_processed.csv')
full_test_path = paths.data_processed_dir('test_processed.csv')

full_train_df.to_csv(full_train_path, sep=',', index=False)
full_test_df.to_csv(full_test_path, sep=',', index=False)

In [35]:
# Saving reduced data

# Each output table is: employee ids | autofeat-transformed reduced features | node features,
# joined column-wise.
red_train_parts = [train_id, X_train_red_transf, node_feat_train]
red_test_parts = [test_id, X_test_red_transf, node_feat_test]

red_train_df = pd.concat(red_train_parts, axis=1)
red_test_df = pd.concat(red_test_parts, axis=1)

red_train_path = paths.data_processed_dir('train_reduced_processed.csv')
red_test_path = paths.data_processed_dir('test_reduced_processed.csv')

red_train_df.to_csv(red_train_path, sep=',', index=False)
red_test_df.to_csv(red_test_path, sep=',', index=False)

In [48]:
# Saving reduced data but with autofeat generated columns with full data

# Select the autofeat-generated columns as "everything after the original
# columns" instead of a hard-coded `-27:`. The number of generated features
# depends on the autofeat run, so a fixed count silently picks the wrong
# columns whenever a re-run selects more or fewer features. Slicing from the
# original column count also stays correct when no feature was generated
# (an empty selection rather than the whole frame).
# Assumes the transformed frame keeps the original columns first and appends
# the generated ones after them, as the previous fixed-width slice also did.
autofeat_train_cols = X_train_full_transf.iloc[:, og_train_df.shape[1]:]
autofeat_test_cols = X_test_full_transf.iloc[:, og_test_df.shape[1]:]

full_red_train_df = pd.concat([train_id, og_reduced_train_df, autofeat_train_cols, node_feat_train], axis=1)
full_red_test_df = pd.concat([test_id, og_reduced_test_df, autofeat_test_cols, node_feat_test], axis=1)

full_red_train_df.to_csv(paths.data_processed_dir('train_full_red_processed.csv'), index=False, sep=',')
full_red_test_df.to_csv(paths.data_processed_dir('test_full_red_processed.csv'), index=False, sep=',')