# Preprocessing data

Categorical columns will be encoded using One-Hot encoding.

Since the distributions of the numerical columns are not normal, we will use scikit-learn's `PowerTransformer` to make them more normal-like.

## Preparing environment

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
import sys
sys.path.append('../high_performance_employee_resign_prediction')
from utils import paths

## Importing data

In [2]:
# Read the interim train and test CSV files into DataFrames (train first, then test).
train_df, test_df = (
    pd.read_csv(paths.data_interim_dir(csv_name))
    for csv_name in ('train_node.csv', 'test_node.csv')
)

In [6]:
# Numerical features: these are the columns passed to the PowerTransformer.
num_cols = [
    'distancia_oficina',
    'dias_baja_salud',
    'permanencia_promedio',
    'salario',
    'performance_score',
    'psi_score',
    'age',
]

# Categorical features: these are the columns passed to the OneHotEncoder.
cat_cols = [
    'seniority',
    'modalidad_trabajo',
    'genero',
    'canal_reclutamiento',
    'estado_civil',
    'join_year',
    'join_month',
    'performance',
]

## Dropping features according to hypothesis testing

In [3]:
# Columns removed from both splits to build the reduced feature set.
cols_to_drop = ['join_month', 'distancia_oficina', 'permanencia_promedio', 'salario', 'psi_score', 'age']

# drop() returns a new frame, so train_df / test_df stay untouched.
reduced_train_df = train_df.drop(columns=cols_to_drop)
reduced_test_df = test_df.drop(columns=cols_to_drop)

In [7]:
# Numerical features kept in the reduced feature set.
reduced_num_cols = [
    'dias_baja_salud',
    'performance_score',
]

# Categorical features kept in the reduced feature set.
reduced_cat_cols = [
    'seniority',
    'modalidad_trabajo',
    'genero',
    'canal_reclutamiento',
    'estado_civil',
    'join_year',
    'performance',
]

## Preprocessing full data

In [9]:
# Transformers for the full feature set; both are fitted on the training split
# and then applied to the test split in the cells below.

# Power transform for the numerical columns (library defaults).
pt_full = PowerTransformer()

# One-hot encoder for the categorical columns: dense output, first level of each
# feature dropped.
# NOTE(review): handle_unknown is left at its default, so transform presumably
# raises on categories not seen during fit — confirm this is intended for the test split.
ohe_full = OneHotEncoder(drop='first', sparse_output=False)

In [13]:
# Encoding categorical columns

# Train: fit the encoder on the training split and encode it. The source index is
# carried over so the later column-wise pd.concat aligns rows by label instead of
# relying on both frames happening to share a default RangeIndex.
encoded_full_train = pd.DataFrame(
    ohe_full.fit_transform(train_df[cat_cols]),
    columns=ohe_full.get_feature_names_out(cat_cols),
    index=train_df.index,
)

# Test: reuse the encoder fitted on train (transform only, no refit).
encoded_full_test = pd.DataFrame(
    ohe_full.transform(test_df[cat_cols]),
    columns=ohe_full.get_feature_names_out(cat_cols),
    index=test_df.index,
)

In [18]:
# Power-transforming numerical columns

# Train: fit the transformer on the training split and transform it. The source
# index is carried over so the later column-wise pd.concat aligns rows by label.
scaled_full_train = pd.DataFrame(
    pt_full.fit_transform(train_df[num_cols]),
    columns=num_cols,
    index=train_df.index,
)

# Test: reuse the transformer fitted on train (transform only, no refit).
scaled_full_test = pd.DataFrame(
    pt_full.transform(test_df[num_cols]),
    columns=num_cols,
    index=test_df.index,
)

### Joining full data

In [24]:
# Replace the raw categorical and numerical columns with their encoded /
# power-transformed versions: drop the raw columns, then append the new ones
# column-wise.

raw_feature_cols = cat_cols + num_cols

# Train
train_drop_df = train_df.drop(columns=raw_feature_cols)
train_full_df = pd.concat(
    [train_drop_df, encoded_full_train, scaled_full_train],
    axis=1,
)

# Test
test_drop_df = test_df.drop(columns=raw_feature_cols)
test_full_df = pd.concat(
    [test_drop_df, encoded_full_test, scaled_full_test],
    axis=1,
)

## Saving full data

In [27]:
# Write the full processed train and test frames as comma-separated CSV files,
# without the index column.
for frame, file_name in (
    (train_full_df, 'train_processed.csv'),
    (test_full_df, 'test_processed.csv'),
):
    frame.to_csv(paths.data_processed_dir(file_name), sep=',', index=False)

## Preprocessing reduced data

In [28]:
# Separate transformers for the reduced feature set, so fitting them does not
# touch the ones fitted on the full feature set.

# Power transform for the reduced numerical columns (library defaults).
pt_reduced = PowerTransformer()

# One-hot encoder for the reduced categorical columns: dense output, first level
# of each feature dropped.
ohe_reduced = OneHotEncoder(drop='first', sparse_output=False)

In [29]:
# Encoding categorical columns (reduced feature set)

# Train: fit the encoder on the training split and encode it. The source index is
# carried over so the later column-wise pd.concat aligns rows by label instead of
# relying on both frames happening to share a default RangeIndex.
encoded_reduced_train = pd.DataFrame(
    ohe_reduced.fit_transform(train_df[reduced_cat_cols]),
    columns=ohe_reduced.get_feature_names_out(reduced_cat_cols),
    index=train_df.index,
)

# Test: reuse the encoder fitted on train (transform only, no refit).
encoded_reduced_test = pd.DataFrame(
    ohe_reduced.transform(test_df[reduced_cat_cols]),
    columns=ohe_reduced.get_feature_names_out(reduced_cat_cols),
    index=test_df.index,
)

In [34]:
# Power-transforming numerical columns (reduced feature set)

# Train: fit the transformer on the training split and transform it. The source
# index is carried over so the later column-wise pd.concat aligns rows by label.
scaled_reduced_train = pd.DataFrame(
    pt_reduced.fit_transform(train_df[reduced_num_cols]),
    columns=reduced_num_cols,
    index=train_df.index,
)

# Test: reuse the transformer fitted on train (transform only, no refit).
scaled_reduced_test = pd.DataFrame(
    pt_reduced.transform(test_df[reduced_num_cols]),
    columns=reduced_num_cols,
    index=test_df.index,
)

### Joining reduced data

In [39]:
# Concatenating encoded and scaled columns (reduced feature set)

# Start from the reduced frames (hypothesis-test columns already removed) and drop
# the remaining raw categorical / numerical columns, which are replaced by their
# encoded / power-transformed versions. This keeps the reduced section independent
# of the intermediate frames built while joining the full data.
replaced_reduced_cols = reduced_cat_cols + reduced_num_cols

# Train
train_reduced_df = pd.concat(
    [
        reduced_train_df.drop(columns=replaced_reduced_cols),
        encoded_reduced_train,
        scaled_reduced_train,
    ],
    axis=1,
)

# Test
test_reduced_df = pd.concat(
    [
        reduced_test_df.drop(columns=replaced_reduced_cols),
        encoded_reduced_test,
        scaled_reduced_test,
    ],
    axis=1,
)

## Saving reduced data

In [45]:
# Write the reduced processed train and test frames as comma-separated CSV files,
# without the index column.
for frame, file_name in (
    (train_reduced_df, 'train_reduced_processed.csv'),
    (test_reduced_df, 'test_reduced_processed.csv'),
):
    frame.to_csv(paths.data_processed_dir(file_name), sep=',', index=False)