# Preprocessing data

Categorical columns will be encoded using One-Hot encoding because Graph Neural Networks benefit from explicit binary representations of categories. Numerical columns will be standardized with `StandardScaler` so that they share a similar scale. Both transformers are fitted on the training set only and then applied to the test set.

## Preparing environment

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Importing data

In [2]:
# Resolve the interim file locations first, then read each cleaned split.
train_clean_path = paths.data_interim_dir('train_clean.csv')
test_clean_path = paths.data_interim_dir('test_clean.csv')

train_df = pd.read_csv(train_clean_path)
test_df = pd.read_csv(test_clean_path)

# Preprocessing full data

In [3]:
# Numerical and categorical column selections.
# Entries are grouped by naming pattern; the sequence of names is unchanged.

num_cols = [
    # columns without a suffix
    'office_distance', 'low_health_days', 'average_permanence', 'salary',
    'performance_score', 'psi_score', 'join_age',
    # `_boss` counterparts
    'office_distance_boss', 'low_health_days_boss', 'average_permanence_boss', 'salary_boss',
    'performance_score_boss', 'psi_score_boss', 'join_age_boss',
    # `_diff` columns
    'salary_diff', 'join_days_diff', 'age_diff',
    # `avg_*_epb` columns
    'avg_od_epb', 'avg_lhd_epb', 'avg_avgp_epb', 'avg_sal_epb',
    'avg_ps_epb', 'avg_psis_epb', 'avg_ja_epb',
    'boss_employees_in_charge',
]

cat_cols = [
    # columns without a suffix
    'seniority', 'work_modality', 'gender', 'recruitment_channel', 'marital_estatus',
    'join_year', 'join_month', 'performance', 'join_age_group',
    # `_boss` counterparts
    'work_modality_boss', 'gender_boss', 'recruitment_channel_boss', 'marital_estatus_boss',
    'join_year_boss', 'join_month_boss', 'performance_boss', 'join_age_group_boss',
    # employee-vs-boss flags
    'joined_after_boss', 'younger_than_boss',
]

In [12]:
# Encoding categorical columns
#
# The encoder is fitted on the training split only and then applied to the test split.
# `drop='first'` removes one level per categorical feature.
# NOTE(review): `handle_unknown` is left at its default, so a category present only in
# the test split will make `transform` raise — confirm this is the intended behaviour.

full_le_encoder = OneHotEncoder(sparse_output=False, drop='first')

# The encoded frames reuse the index of their source frame: `pd.concat(axis=1)` aligns
# on index, so a fresh RangeIndex would silently misalign rows (and introduce NaNs)
# whenever the source frame does not carry a default 0..n-1 index.
encoded_full_cats_train = pd.DataFrame(
    full_le_encoder.fit_transform(train_df[cat_cols]),
    index=train_df.index,
)
encoded_full_cats_train.columns = full_le_encoder.get_feature_names_out()
encoded_full_cats_test = pd.DataFrame(
    full_le_encoder.transform(test_df[cat_cols]),
    index=test_df.index,
)
encoded_full_cats_test.columns = full_le_encoder.get_feature_names_out()

# Concatenating encoded columns to original data

processed_full_train = pd.concat([train_df.drop(columns=cat_cols), encoded_full_cats_train], axis=1)
processed_full_test = pd.concat([test_df.drop(columns=cat_cols), encoded_full_cats_test], axis=1)

# Standardizing numerical columns (scaler statistics come from the training split only)

full_standard_scaler = StandardScaler()
processed_full_train[num_cols] = full_standard_scaler.fit_transform(train_df[num_cols])
processed_full_test[num_cols] = full_standard_scaler.transform(test_df[num_cols])

In [15]:
# Persist both fully-processed splits as CSV files, without the index column.

for full_frame, full_filename in (
    (processed_full_train, 'train_full_processed.csv'),
    (processed_full_test, 'test_full_processed.csv'),
):
    full_frame.to_csv(paths.data_processed_dir(full_filename), index=False)

# Preprocessing reduced data (features selected by hypothesis testing)



In [16]:
# Columns left out of the reduced dataset.
discard_cols = [
    # entries that also appear in `cat_cols`
    'join_month',
    'work_modality_boss', 'gender_boss', 'recruitment_channel_boss', 'marital_estatus_boss',
    'join_year_boss', 'join_month_boss', 'join_age_group_boss',
    'younger_than_boss',
    # entries that also appear in `num_cols`
    'office_distance', 'average_permanence', 'salary', 'psi_score', 'join_age',
    'office_distance_boss', 'average_permanence_boss', 'salary_boss', 'psi_score_boss', 'join_age_boss',
    'age_diff',
    'avg_od_epb', 'avg_lhd_epb', 'avg_sal_epb', 'avg_psis_epb',
    'boss_employees_in_charge',
]

# Discarding features (new frames are returned; `train_df` / `test_df` stay untouched)

train_red_df = train_df.drop(labels=discard_cols, axis='columns')
test_red_df = test_df.drop(labels=discard_cols, axis='columns')

In [22]:
# Keep only the numerical / categorical column names that were not discarded,
# preserving their relative order in `num_cols` and `cat_cols`.

red_num_cols = list(filter(lambda name: name not in discard_cols, num_cols))
red_cat_cols = list(filter(lambda name: name not in discard_cols, cat_cols))

In [24]:
# Encoding categorical columns
#
# The encoder is fitted on the reduced training split only and then applied to the
# reduced test split. `drop='first'` removes one level per categorical feature.
# NOTE(review): `handle_unknown` is left at its default, so a category present only in
# the test split will make `transform` raise — confirm this is the intended behaviour.

red_le_encoder = OneHotEncoder(sparse_output=False, drop='first')

# The encoded frames reuse the index of their source frame: `pd.concat(axis=1)` aligns
# on index, so a fresh RangeIndex would silently misalign rows (and introduce NaNs)
# whenever the source frame does not carry a default 0..n-1 index.
encoded_red_cats_train = pd.DataFrame(
    red_le_encoder.fit_transform(train_red_df[red_cat_cols]),
    index=train_red_df.index,
)
encoded_red_cats_train.columns = red_le_encoder.get_feature_names_out()
encoded_red_cats_test = pd.DataFrame(
    red_le_encoder.transform(test_red_df[red_cat_cols]),
    index=test_red_df.index,
)
encoded_red_cats_test.columns = red_le_encoder.get_feature_names_out()

# Concatenating encoded columns to original reduced data

processed_red_train = pd.concat([train_red_df.drop(columns=red_cat_cols), encoded_red_cats_train], axis=1)
processed_red_test = pd.concat([test_red_df.drop(columns=red_cat_cols), encoded_red_cats_test], axis=1)

# Standardizing numerical columns (scaler statistics come from the training split only)

red_standard_scaler = StandardScaler()
processed_red_train[red_num_cols] = red_standard_scaler.fit_transform(train_red_df[red_num_cols])
processed_red_test[red_num_cols] = red_standard_scaler.transform(test_red_df[red_num_cols])

In [26]:
# Persist both reduced, processed splits as CSV files, without the index column.

for red_frame, red_filename in (
    (processed_red_train, 'train_red_processed.csv'),
    (processed_red_test, 'test_red_processed.csv'),
):
    red_frame.to_csv(paths.data_processed_dir(red_filename), index=False)

With this preprocessing done, we can run two experiments: modelling on the full data and modelling on the reduced data, in order to find which approach works better for the GNN.